From a7f495f170864e6bddc4bb29ae7fae293a7136aa Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 10 Jun 2025 13:30:31 -0700 Subject: [PATCH 001/851] [lldb] Revive TestSimulatorPlatform.py (#142244) This test was incorrectly disabled and bitrotted since then. This PR fixes up the test and re-enables it. - Build against the system libc++ (which can target the simulator) - Bump the deployment target for iOS and tvOS on Apple Silicon - Skip backdeploying to pre-Apple Silicon OS on Apple Silicon. --- .../Python/lldbsuite/test/decorators.py | 54 +++++++++++++++++-- .../macosx/simulator/TestSimulatorPlatform.py | 12 +++-- 2 files changed, 57 insertions(+), 9 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py index 868e9f7e5eca0..a391319ca9b0e 100644 --- a/lldb/packages/Python/lldbsuite/test/decorators.py +++ b/lldb/packages/Python/lldbsuite/test/decorators.py @@ -9,6 +9,7 @@ import sys import tempfile import subprocess +import json # Third-party modules import unittest @@ -451,24 +452,67 @@ def apple_simulator_test(platform): """ Decorate the test as a test requiring a simulator for a specific platform. - Consider that a simulator is available if you have the corresponding SDK installed. - The SDK identifiers for simulators are iphonesimulator, appletvsimulator, watchsimulator + Consider that a simulator is available if you have the corresponding SDK + and runtime installed. + + The SDK identifiers for simulators are iphonesimulator, appletvsimulator, + watchsimulator """ def should_skip_simulator_test(): if lldbplatformutil.getHostPlatform() not in ["darwin", "macosx"]: return "simulator tests are run only on darwin hosts." + + # Make sure we recognize the platform. + mapping = { + "iphone": "ios", + "appletv": "tvos", + "watch": "watchos", + } + if platform not in mapping: + return "unknown simulator platform: {}".format(platform) + + # Make sure we have an SDK. 
try: output = subprocess.check_output( ["xcodebuild", "-showsdks"], stderr=subprocess.DEVNULL ).decode("utf-8") - if re.search("%ssimulator" % platform, output): - return None - else: + if not re.search("%ssimulator" % platform, output): return "%s simulator is not supported on this system." % platform except subprocess.CalledProcessError: return "Simulators are unsupported on this system (xcodebuild failed)" + # Make sure we have a simulator runtime. + try: + sim_devices_str = subprocess.check_output( + ["xcrun", "simctl", "list", "-j", "devices"] + ).decode("utf-8") + + sim_devices = json.loads(sim_devices_str)["devices"] + for simulator in sim_devices: + if isinstance(simulator, dict): + runtime = simulator["name"] + devices = simulator["devices"] + else: + runtime = simulator + devices = sim_devices[simulator] + + if not mapping[platform] in runtime.lower(): + continue + + for device in devices: + if ( + "availability" in device + and device["availability"] == "(available)" + ): + return None + if "isAvailable" in device and device["isAvailable"]: + return None + + return "{} simulator is not supported on this system.".format(platform) + except (subprocess.CalledProcessError, json.decoder.JSONDecodeError): + return "Simulators are unsupported on this system (simctl failed)" + return skipTestIfFn(should_skip_simulator_test) diff --git a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py index faf2256b03a0d..74ba0ee6c83bb 100644 --- a/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py +++ b/lldb/test/API/macosx/simulator/TestSimulatorPlatform.py @@ -39,15 +39,15 @@ def check_debugserver(self, log, expected_platform, expected_version): if expected_version: self.assertEqual(aout_info["min_version_os_sdk"], expected_version) - @skipIf(bugnumber="rdar://76995109") def run_with(self, arch, os, vers, env, expected_load_command): env_list = [env] if env else [] triple = "-".join([arch, "apple", os + vers] +
env_list) sdk = lldbutil.get_xcode_sdk(os, env) - version_min = "" if not vers: vers = lldbutil.get_xcode_sdk_version(sdk) + + version_min = "" if env == "simulator": version_min = "-m{}-simulator-version-min={}".format(os, vers) elif os == "macosx": @@ -56,11 +56,14 @@ def run_with(self, arch, os, vers, env, expected_load_command): sdk_root = lldbutil.get_xcode_sdk_root(sdk) clang = lldbutil.get_xcode_clang(sdk) + print(triple) + self.build( dictionary={ "ARCH": arch, "ARCH_CFLAGS": "-target {} {}".format(triple, version_min), "SDKROOT": sdk_root, + "USE_SYSTEM_STDLIB": 1, }, compiler=clang, ) @@ -146,6 +149,7 @@ def test_watchos_armv7k(self): @skipUnlessDarwin @skipIfDarwinEmbedded + @skipIf(archs=["arm64", "arm64e"]) def test_lc_version_min_macosx(self): """Test running a back-deploying non-simulator MacOS X binary""" self.run_with( @@ -198,7 +202,7 @@ def test_ios_backdeploy_apple_silicon(self): self.run_with( arch=self.getArchitecture(), os="ios", - vers="11.0", + vers="14.0", env="simulator", expected_load_command="LC_BUILD_VERSION", ) @@ -229,7 +233,7 @@ def test_tvos_backdeploy_apple_silicon(self): self.run_with( arch=self.getArchitecture(), os="tvos", - vers="11.0", + vers="14.0", env="simulator", expected_load_command="LC_BUILD_VERSION", ) From d7282c56cd294a2eb4890e50c84e6eae6f7c6671 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 10 Jun 2025 23:34:26 +0300 Subject: [PATCH 002/851] [llvm-rc] Add support for multiplication and division in expressions (#143373) This is supported by GNU windres. MS rc.exe does accept these expressions, but doesn't evaluate them correctly, it only returns the left hand side. This fixes one aspect of https://github.com/llvm/llvm-project/issues/143157.
--- llvm/test/tools/llvm-rc/Inputs/parser-expr.rc | 5 ++ llvm/test/tools/llvm-rc/Inputs/tokens.rc | 1 + llvm/test/tools/llvm-rc/parser-expr.test | 5 ++ llvm/test/tools/llvm-rc/tokenizer.test | 5 ++ llvm/tools/llvm-rc/ResourceScriptParser.cpp | 49 ++++++++++++++----- llvm/tools/llvm-rc/ResourceScriptParser.h | 1 + llvm/tools/llvm-rc/ResourceScriptStmt.h | 24 +++++++++ llvm/tools/llvm-rc/ResourceScriptToken.cpp | 12 ++++- llvm/tools/llvm-rc/ResourceScriptToken.h | 7 ++- .../tools/llvm-rc/ResourceScriptTokenList.def | 2 + 10 files changed, 97 insertions(+), 14 deletions(-) diff --git a/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc b/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc index 8e69c1cd1fa16..2f8e4b2d344a0 100644 --- a/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc +++ b/llvm/test/tools/llvm-rc/Inputs/parser-expr.rc @@ -5,6 +5,11 @@ LANGUAGE 1|1&0, 0&0|1 LANGUAGE 3+4-5, 3-4+5 LANGUAGE 1+2|3, 3|1+2 LANGUAGE 6&~5, 6&-8 +LANGUAGE 7/3, 7*3 +LANGUAGE 5/2*2, 5*3/2 +LANGUAGE 1+2*3, (1+2)*3 +LANGUAGE 100/12/5*5, 1+1+1+1*4 +LANGUAGE 9/(1+3), (4+5)/4 LANGUAGE -1, --1 LANGUAGE ----1, -----1 LANGUAGE ~1, ~~1 diff --git a/llvm/test/tools/llvm-rc/Inputs/tokens.rc b/llvm/test/tools/llvm-rc/Inputs/tokens.rc index 6a781202a7e37..20f77912477d9 100644 --- a/llvm/test/tools/llvm-rc/Inputs/tokens.rc +++ b/llvm/test/tools/llvm-rc/Inputs/tokens.rc @@ -1,4 +1,5 @@ 1 + 2 - 3214L & 0x120894 032173 2|&~+(-7){0xabcdef 0xABCDEFl} Begin End +1*3/4 He11o LLVM identifier-with-dashes diff --git a/llvm/test/tools/llvm-rc/parser-expr.test b/llvm/test/tools/llvm-rc/parser-expr.test index ed6796529fdfa..14a299c9e3e96 100644 --- a/llvm/test/tools/llvm-rc/parser-expr.test +++ b/llvm/test/tools/llvm-rc/parser-expr.test @@ -7,6 +7,11 @@ ; CHECK-NEXT: Language: 2, Sublanguage: 4 ; CHECK-NEXT: Language: 3, Sublanguage: 5 ; CHECK-NEXT: Language: 2, Sublanguage: 0 +; CHECK-NEXT: Language: 2, Sublanguage: 21 +; CHECK-NEXT: Language: 4, Sublanguage: 7 +; CHECK-NEXT: Language: 7, Sublanguage: 9 +; CHECK-NEXT: 
Language: 5, Sublanguage: 7 +; CHECK-NEXT: Language: 2, Sublanguage: 2 ; CHECK-NEXT: Language: 4294967295, Sublanguage: 1 ; CHECK-NEXT: Language: 1, Sublanguage: 4294967295 ; CHECK-NEXT: Language: 4294967294, Sublanguage: 1 diff --git a/llvm/test/tools/llvm-rc/tokenizer.test b/llvm/test/tools/llvm-rc/tokenizer.test index 8486f8bd78690..3062e2bf64629 100644 --- a/llvm/test/tools/llvm-rc/tokenizer.test +++ b/llvm/test/tools/llvm-rc/tokenizer.test @@ -25,6 +25,11 @@ ; CHECK-NEXT: BlockEnd: } ; CHECK-NEXT: BlockBegin: Begin ; CHECK-NEXT: BlockEnd: End +; CHECK-NEXT: Int: 1; int value = 1 +; CHECK-NEXT: Asterisk: * +; CHECK-NEXT: Int: 3; int value = 3 +; CHECK-NEXT: Slash: / +; CHECK-NEXT: Int: 4; int value = 4 ; CHECK-NEXT: Identifier: He11o ; CHECK-NEXT: Identifier: LLVM ; CHECK-NEXT: Identifier: identifier-with-dashes diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.cpp b/llvm/tools/llvm-rc/ResourceScriptParser.cpp index 69798152c1f25..e4efc83c933b4 100644 --- a/llvm/tools/llvm-rc/ResourceScriptParser.cpp +++ b/llvm/tools/llvm-rc/ResourceScriptParser.cpp @@ -132,12 +132,13 @@ void RCParser::consume() { // // The following grammar is used to parse the expressions Exp1: // Exp1 ::= Exp2 || Exp1 + Exp2 || Exp1 - Exp2 || Exp1 | Exp2 || Exp1 & Exp2 -// Exp2 ::= -Exp2 || ~Exp2 || not Expr2 || Int || (Exp1). -// (More conveniently, Exp1 is a non-empty sequence of Exp2 expressions, -// separated by binary operators.) +// Exp2 ::= Exp3 || Exp3 * Exp3 || Exp3 / Exp3 +// Exp3 ::= -Exp3 || ~Exp3 || not Expr3 || Int || (Exp1) +// (More conveniently, Exp1 and Exp2 are non-empty sequences of Exp3 +// expressions, separated by binary operators.) // -// Expressions of type Exp1 are read by parseIntExpr1(Inner) method, while Exp2 -// is read by parseIntExpr2(). +// Expressions of type Exp1 are read by parseIntExpr1(Inner) method, Exp2 +// is read by parseIntExpr2() and Exp3 is read by parseIntExpr3(). // // The original Microsoft tool handles multiple unary operators incorrectly. 
// For example, in 16-bit little-endian integers: @@ -158,7 +159,7 @@ Expected RCParser::parseIntExpr1() { ASSIGN_OR_RETURN(FirstResult, parseIntExpr2()); IntWithNotMask Result = *FirstResult; - while (!isEof() && look().isBinaryOp()) { + while (!isEof() && look().isLowPrecedenceBinaryOp()) { auto OpToken = read(); ASSIGN_OR_RETURN(NextResult, parseIntExpr2()); @@ -180,7 +181,7 @@ Expected RCParser::parseIntExpr1() { break; default: - llvm_unreachable("Already processed all binary ops."); + llvm_unreachable("Already processed all low precedence binary ops."); } } @@ -188,7 +189,33 @@ Expected RCParser::parseIntExpr1() { } Expected RCParser::parseIntExpr2() { - // Exp2 ::= -Exp2 || ~Exp2 || not Expr2 || Int || (Exp1). + // Exp2 ::= Exp3 || Exp3 * Exp3 || Exp3 / Exp3. + ASSIGN_OR_RETURN(FirstResult, parseIntExpr3()); + IntWithNotMask Result = *FirstResult; + + while (!isEof() && look().isHighPrecedenceBinaryOp()) { + auto OpToken = read(); + ASSIGN_OR_RETURN(NextResult, parseIntExpr3()); + + switch (OpToken.kind()) { + case Kind::Asterisk: + Result *= *NextResult; + break; + + case Kind::Slash: + Result /= *NextResult; + break; + + default: + llvm_unreachable("Already processed all high precedence binary ops."); + } + } + + return Result; +} + +Expected RCParser::parseIntExpr3() { + // Exp3 ::= -Exp3 || ~Exp3 || not Expr3 || Int || (Exp1). 
static const char ErrorMsg[] = "'-', '~', integer or '('"; if (isEof()) @@ -197,13 +224,13 @@ Expected RCParser::parseIntExpr2() { switch (look().kind()) { case Kind::Minus: { consume(); - ASSIGN_OR_RETURN(Result, parseIntExpr2()); + ASSIGN_OR_RETURN(Result, parseIntExpr3()); return -(*Result); } case Kind::Tilde: { consume(); - ASSIGN_OR_RETURN(Result, parseIntExpr2()); + ASSIGN_OR_RETURN(Result, parseIntExpr3()); return ~(*Result); } @@ -220,7 +247,7 @@ Expected RCParser::parseIntExpr2() { case Kind::Identifier: { if (!read().value().equals_insensitive("not")) return getExpectedError(ErrorMsg, true); - ASSIGN_OR_RETURN(Result, parseIntExpr2()); + ASSIGN_OR_RETURN(Result, parseIntExpr3()); return IntWithNotMask(0, (*Result).getValue()); } diff --git a/llvm/tools/llvm-rc/ResourceScriptParser.h b/llvm/tools/llvm-rc/ResourceScriptParser.h index aa7f847187c49..1e7618c84142e 100644 --- a/llvm/tools/llvm-rc/ResourceScriptParser.h +++ b/llvm/tools/llvm-rc/ResourceScriptParser.h @@ -88,6 +88,7 @@ class RCParser { // Helper integer expression parsing methods. Expected parseIntExpr1(); Expected parseIntExpr2(); + Expected parseIntExpr3(); // Advance the state by one, discarding the current token. // If the discarded token had an incorrect type, fail. 
diff --git a/llvm/tools/llvm-rc/ResourceScriptStmt.h b/llvm/tools/llvm-rc/ResourceScriptStmt.h index 8f099202c0b47..a81e384fda365 100644 --- a/llvm/tools/llvm-rc/ResourceScriptStmt.h +++ b/llvm/tools/llvm-rc/ResourceScriptStmt.h @@ -49,6 +49,16 @@ class RCInt { return *this; } + RCInt &operator*=(const RCInt &Rhs) { + std::tie(Val, Long) = std::make_pair(Val * Rhs.Val, Long | Rhs.Long); + return *this; + } + + RCInt &operator/=(const RCInt &Rhs) { + std::tie(Val, Long) = std::make_pair(Val / Rhs.Val, Long | Rhs.Long); + return *this; + } + RCInt &operator|=(const RCInt &Rhs) { std::tie(Val, Long) = std::make_pair(Val | Rhs.Val, Long | Rhs.Long); return *this; @@ -98,6 +108,20 @@ class IntWithNotMask { return *this; } + IntWithNotMask &operator*=(const IntWithNotMask &Rhs) { + Value &= ~Rhs.NotMask; + Value *= Rhs.Value; + NotMask |= Rhs.NotMask; + return *this; + } + + IntWithNotMask &operator/=(const IntWithNotMask &Rhs) { + Value &= ~Rhs.NotMask; + Value /= Rhs.Value; + NotMask |= Rhs.NotMask; + return *this; + } + IntWithNotMask &operator|=(const IntWithNotMask &Rhs) { Value &= ~Rhs.NotMask; Value |= Rhs.Value; diff --git a/llvm/tools/llvm-rc/ResourceScriptToken.cpp b/llvm/tools/llvm-rc/ResourceScriptToken.cpp index aad1060c4a381..0070037e63e6a 100644 --- a/llvm/tools/llvm-rc/ResourceScriptToken.cpp +++ b/llvm/tools/llvm-rc/ResourceScriptToken.cpp @@ -64,7 +64,7 @@ StringRef RCToken::value() const { return TokenValue; } Kind RCToken::kind() const { return TokenKind; } -bool RCToken::isBinaryOp() const { +bool RCToken::isLowPrecedenceBinaryOp() const { switch (TokenKind) { case Kind::Plus: case Kind::Minus: @@ -76,6 +76,16 @@ bool RCToken::isBinaryOp() const { } } +bool RCToken::isHighPrecedenceBinaryOp() const { + switch (TokenKind) { + case Kind::Asterisk: + case Kind::Slash: + return true; + default: + return false; + } +} + static Error getStringError(const Twine &message) { return make_error("Error parsing file: " + message, inconvertibleErrorCode()); diff 
--git a/llvm/tools/llvm-rc/ResourceScriptToken.h b/llvm/tools/llvm-rc/ResourceScriptToken.h index 29f7502f89efd..3dcdfafd2d576 100644 --- a/llvm/tools/llvm-rc/ResourceScriptToken.h +++ b/llvm/tools/llvm-rc/ResourceScriptToken.h @@ -56,8 +56,11 @@ class RCToken { StringRef value() const; Kind kind() const; - // Check if a token describes a binary operator. - bool isBinaryOp() const; + // Check if a token describes a low precedence binary operator. + bool isLowPrecedenceBinaryOp() const; + + // Check if a token describes a high precedence binary operator. + bool isHighPrecedenceBinaryOp() const; private: Kind TokenKind; diff --git a/llvm/tools/llvm-rc/ResourceScriptTokenList.def b/llvm/tools/llvm-rc/ResourceScriptTokenList.def index a61a96461f0fb..6ee13b2815d35 100644 --- a/llvm/tools/llvm-rc/ResourceScriptTokenList.def +++ b/llvm/tools/llvm-rc/ResourceScriptTokenList.def @@ -29,6 +29,8 @@ SHORT_TOKEN(BlockEnd, '}') // End of the block; can also be END. SHORT_TOKEN(Comma, ',') // Comma - resource arguments separator. SHORT_TOKEN(Plus, '+') // Addition operator. SHORT_TOKEN(Minus, '-') // Subtraction operator. +SHORT_TOKEN(Asterisk, '*') // Multiplication operator. +SHORT_TOKEN(Slash, '/') // Division operator. SHORT_TOKEN(Pipe, '|') // Bitwise-OR operator. SHORT_TOKEN(Amp, '&') // Bitwise-AND operator. SHORT_TOKEN(Tilde, '~') // Bitwise-NOT operator. From 62b3e89afc54a118d597a27185f6915a68e408a0 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 10 Jun 2025 21:37:03 +0100 Subject: [PATCH 003/851] [LV] Remove unused LoopBypassBlocks from ILV (NFC). After recent refactorings to move parts of skeleton creation LoopBypassBlocks isn't used any more. Remove it. 
--- .../lib/Transforms/Vectorize/LoopVectorize.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 333e50ee98418..427c1460fcfc9 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -611,9 +611,6 @@ class InnerLoopVectorizer { /// Middle Block between the vector and the scalar. BasicBlock *LoopMiddleBlock = nullptr; - /// A list of all bypass blocks. The first block is the entry of the loop. - SmallVector LoopBypassBlocks; - /// Trip count of the original loop. Value *TripCount = nullptr; @@ -2445,7 +2442,6 @@ void InnerLoopVectorizer::emitIterationCountCheck(BasicBlock *Bypass) { if (hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) setBranchWeights(BI, MinItersBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(TCCheckBlock->getTerminator(), &BI); - LoopBypassBlocks.push_back(TCCheckBlock); assert(cast(Plan.getEntry())->getIRBasicBlock() == TCCheckBlock && @@ -2461,9 +2457,6 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { assert((!Cost->OptForSize || Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) && "Cannot SCEV check stride or overflow when optimizing for size"); - assert(!LoopBypassBlocks.empty() && - "Should already be a bypass block due to iteration count check"); - LoopBypassBlocks.push_back(SCEVCheckBlock); AddedSafetyChecks = true; introduceCheckBlockInVPlan(SCEVCheckBlock); @@ -2499,7 +2492,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { }); } - LoopBypassBlocks.push_back(MemCheckBlock); AddedSafetyChecks = true; @@ -7557,8 +7549,6 @@ EpilogueVectorizerMainLoop::emitIterationCountCheck(BasicBlock *Bypass, nullptr, "vector.ph"); if (ForEpilogue) { - LoopBypassBlocks.push_back(TCCheckBlock); - // Save the trip count so we don't have to regenerate it in the // vec.epilog.iter.check. 
This is safe to do because the trip count // generated here dominates the vector epilog iter check. @@ -7619,13 +7609,6 @@ EpilogueVectorizerEpilogueLoop::createEpilogueVectorizedLoopSkeleton() { DT->changeImmediateDominator(LoopScalarPreHeader, EPI.EpilogueIterationCountCheck); - // Keep track of bypass blocks, as they feed start values to the induction and - // reduction phis in the scalar loop preheader. - if (EPI.SCEVSafetyCheck) - LoopBypassBlocks.push_back(EPI.SCEVSafetyCheck); - if (EPI.MemSafetyCheck) - LoopBypassBlocks.push_back(EPI.MemSafetyCheck); - LoopBypassBlocks.push_back(EPI.EpilogueIterationCountCheck); // The vec.epilog.iter.check block may contain Phi nodes from inductions or // reductions which merge control-flow from the latch block and the middle @@ -7696,7 +7679,6 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( setBranchWeights(BI, Weights, /*IsExpected=*/false); } ReplaceInstWithInst(Insert->getTerminator(), &BI); - LoopBypassBlocks.push_back(Insert); // A new entry block has been created for the epilogue VPlan. Hook it in, as // otherwise we would try to modify the entry to the main vector loop. From 830a74092adafa425db05e1c5120d3294f874777 Mon Sep 17 00:00:00 2001 From: Tomohiro Kashiwada Date: Wed, 11 Jun 2025 05:42:36 +0900 Subject: [PATCH 004/851] [Clang] [Cygwin] va_list must be treated like normal Windows (#143115) Handling of va_list on Cygwin environment must be matched to normal Windows environment. The existing test `test/CodeGen/ms_abi.c` seems relevant, but it contains `__attribute__((sysv_abi))`, which is not supported on Cygwin. The new test is based on the `__attribute__((ms_abi))` portion of that test. 
--------- Co-authored-by: jeremyd2019 --- clang/lib/Basic/Targets/X86.h | 4 +++ clang/test/CodeGen/X86/cygwin-varargs.c | 35 +++++++++++++++++++++++++ 2 files changed, 39 insertions(+) create mode 100644 clang/test/CodeGen/X86/cygwin-varargs.c diff --git a/clang/lib/Basic/Targets/X86.h b/clang/lib/Basic/Targets/X86.h index 6f8a2365be256..ecb31ffa4750f 100644 --- a/clang/lib/Basic/Targets/X86.h +++ b/clang/lib/Basic/Targets/X86.h @@ -997,6 +997,10 @@ class LLVM_LIBRARY_VISIBILITY CygwinX86_64TargetInfo : public X86_64TargetInfo { if (Opts.CPlusPlus) Builder.defineMacro("_GNU_SOURCE"); } + + BuiltinVaListKind getBuiltinVaListKind() const override { + return TargetInfo::CharPtrBuiltinVaList; + } }; class LLVM_LIBRARY_VISIBILITY DarwinX86_64TargetInfo diff --git a/clang/test/CodeGen/X86/cygwin-varargs.c b/clang/test/CodeGen/X86/cygwin-varargs.c new file mode 100644 index 0000000000000..4eea7d64bcb35 --- /dev/null +++ b/clang/test/CodeGen/X86/cygwin-varargs.c @@ -0,0 +1,35 @@ +// RUN: %clang_cc1 -triple x86_64-windows-gnu -emit-llvm < %s | FileCheck %s +// RUN: %clang_cc1 -triple x86_64-pc-cygwin -emit-llvm < %s | FileCheck %s + +struct foo { + int x; + float y; + char z; +}; +// CHECK: %[[STRUCT_FOO:.*]] = type { i32, float, i8 } + +void f(int a, ...) 
{ + // CHECK-LABEL: define dso_local void @f + __builtin_va_list ap; + __builtin_va_start(ap, a); + // CHECK: %[[AP:.*]] = alloca ptr + // CHECK: call void @llvm.va_start + int b = __builtin_va_arg(ap, int); + // CHECK: %[[AP_CUR:.*]] = load ptr, ptr %[[AP]] + // CHECK-NEXT: %[[AP_NEXT:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR]], i64 8 + // CHECK-NEXT: store ptr %[[AP_NEXT]], ptr %[[AP]] + double _Complex c = __builtin_va_arg(ap, double _Complex); + // CHECK: %[[AP_CUR2:.*]] = load ptr, ptr %[[AP]] + // CHECK-NEXT: %[[AP_NEXT2:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR2]], i64 8 + // CHECK-NEXT: store ptr %[[AP_NEXT2]], ptr %[[AP]] + // CHECK-NEXT: load ptr, ptr %[[AP_CUR2]] + struct foo d = __builtin_va_arg(ap, struct foo); + // CHECK: %[[AP_CUR3:.*]] = load ptr, ptr %[[AP]] + // CHECK-NEXT: %[[AP_NEXT3:.*]] = getelementptr inbounds i8, ptr %[[AP_CUR3]], i64 8 + // CHECK-NEXT: store ptr %[[AP_NEXT3]], ptr %[[AP]] + __builtin_va_list ap2; + __builtin_va_copy(ap2, ap); + // CHECK: call void @llvm.va_copy + __builtin_va_end(ap); + // CHECK: call void @llvm.va_end +} From 13ccce28776d8ad27b0c6a92b5a452d62da05663 Mon Sep 17 00:00:00 2001 From: Sumanth Gundapaneni Date: Tue, 10 Jun 2025 15:46:27 -0500 Subject: [PATCH 005/851] [SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (#135788) Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes the base for memory operations. This transformation is true under the following conditions Check 1 - B and C are disjoint. Check 2 - XOR(A,C) and B are disjoint. This transformation is beneficial particularly for GEPs because Disjoint OR operations often map better to addressing modes than XOR. 
This can enable further optimizations in the GEP offset folding pipeline --- .../Scalar/SeparateConstOffsetFromGEP.cpp | 193 +++++++++++++++++ .../AMDGPU/xor-to-or-disjoint.ll | 204 ++++++++++++++++++ 2 files changed, 397 insertions(+) create mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 320b79203c0b3..6fae9f1dd2404 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -174,6 +174,7 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -190,6 +191,7 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include #include @@ -198,6 +200,8 @@ using namespace llvm; using namespace llvm::PatternMatch; +#define DEBUG_TYPE "separate-offset-gep" + static cl::opt DisableSeparateConstOffsetFromGEP( "disable-separate-const-offset-from-gep", cl::init(false), cl::desc("Do not separate the constant offset from a GEP instruction"), @@ -488,6 +492,42 @@ class SeparateConstOffsetFromGEP { DenseMap> DominatingSubs; }; +/// A helper class that aims to convert xor operations into or operations when +/// their operands are disjoint and the result is used in a GEP's index. This +/// can then enable further GEP optimizations by effectively turning BaseVal | +/// Const into BaseVal + Const when they are disjoint, which +/// SeparateConstOffsetFromGEP can then process. 
This is a common pattern that +/// sets up a grid of memory accesses across a wave where each thread accesses +/// data at various offsets. +class XorToOrDisjointTransformer { +public: + XorToOrDisjointTransformer(Function &F, DominatorTree &DT, + const DataLayout &DL) + : F(F), DT(DT), DL(DL) {} + + bool run(); + +private: + Function &F; + DominatorTree &DT; + const DataLayout &DL; + /// Maps a common operand to all Xor instructions + using XorOpList = SmallVector, 8>; + using XorBaseValInst = DenseMap; + XorBaseValInst XorGroups; + + /// Checks if the given value has at least one GetElementPtr user + static bool hasGEPUser(const Value *V); + + /// Helper function to check if BaseXor dominates all XORs in the group + bool dominatesAllXors(BinaryOperator *BaseXor, const XorOpList &XorsInGroup); + + /// Processes a group of XOR instructions that share the same non-constant + /// base operand. Returns true if this group's processing modified the + /// function. + bool processXorGroup(Instruction *OriginalBaseInst, XorOpList &XorsInGroup); +}; + } // end anonymous namespace char SeparateConstOffsetFromGEPLegacyPass::ID = 0; @@ -1223,6 +1263,154 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { return true; } +// Helper function to check if an instruction has at least one GEP user +bool XorToOrDisjointTransformer::hasGEPUser(const Value *V) { + return llvm::any_of(V->users(), [](const User *U) { + return isa(U); + }); +} + +bool XorToOrDisjointTransformer::dominatesAllXors( + BinaryOperator *BaseXor, const XorOpList &XorsInGroup) { + return llvm::all_of(XorsInGroup, [&](const auto &XorEntry) { + BinaryOperator *XorInst = XorEntry.first; + // Do not evaluate the BaseXor, otherwise we end up cloning it.
+ return XorInst == BaseXor || DT.dominates(BaseXor, XorInst); + }); +} + +bool XorToOrDisjointTransformer::processXorGroup(Instruction *OriginalBaseInst, + XorOpList &XorsInGroup) { + bool Changed = false; + if (XorsInGroup.size() <= 1) + return false; + + // Sort XorsInGroup by the constant offset value in increasing order. + llvm::sort(XorsInGroup, [](const auto &A, const auto &B) { + return A.second.slt(B.second); + }); + + // Dominance check + // The "base" XOR for dominance purposes is the one with the smallest + // constant. + BinaryOperator *XorWithSmallConst = XorsInGroup[0].first; + + if (!dominatesAllXors(XorWithSmallConst, XorsInGroup)) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE + << ": Cloning and inserting XOR with smallest constant (" + << *XorWithSmallConst + << ") as it does not dominate all other XORs" + << " in function " << F.getName() << "\n"); + + BinaryOperator *ClonedXor = + cast(XorWithSmallConst->clone()); + ClonedXor->setName(XorWithSmallConst->getName() + ".dom_clone"); + ClonedXor->insertAfter(OriginalBaseInst); + LLVM_DEBUG(dbgs() << " Cloned Inst: " << *ClonedXor << "\n"); + Changed = true; + XorWithSmallConst = ClonedXor; + } + + SmallVector InstructionsToErase; + const APInt SmallestConst = + cast(XorWithSmallConst->getOperand(1))->getValue(); + + // Main transformation loop: Iterate over the original XORs in the sorted + // group. + for (const auto &XorEntry : XorsInGroup) { + BinaryOperator *XorInst = XorEntry.first; // Original XOR instruction + const APInt ConstOffsetVal = XorEntry.second; + + // Do not process the one with smallest constant as it is the base. 
+ if (XorInst == XorWithSmallConst) + continue; + + // Disjointness Check 1 + APInt NewConstVal = ConstOffsetVal - SmallestConst; + if ((NewConstVal & SmallestConst) != 0) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Cannot transform XOR in function " + << F.getName() << ":\n" + << " New Const: " << NewConstVal + << " Smallest Const: " << SmallestConst + << " are not disjoint \n"); + continue; + } + + // Disjointness Check 2 + if (MaskedValueIsZero(XorWithSmallConst, NewConstVal, SimplifyQuery(DL), + 0)) { + LLVM_DEBUG(dbgs() << DEBUG_TYPE + << ": Transforming XOR to OR (disjoint) in function " + << F.getName() << ":\n" + << " Xor: " << *XorInst << "\n" + << " Base Val: " << *XorWithSmallConst << "\n" + << " New Const: " << NewConstVal << "\n"); + + auto *NewOrInst = BinaryOperator::CreateDisjointOr( + XorWithSmallConst, + ConstantInt::get(OriginalBaseInst->getType(), NewConstVal), + XorInst->getName() + ".or_disjoint", XorInst->getIterator()); + + NewOrInst->copyMetadata(*XorInst); + XorInst->replaceAllUsesWith(NewOrInst); + LLVM_DEBUG(dbgs() << " New Inst: " << *NewOrInst << "\n"); + InstructionsToErase.push_back(XorInst); // Mark original XOR for deletion + + Changed = true; + } else { + LLVM_DEBUG( + dbgs() << DEBUG_TYPE + << ": Cannot transform XOR (not proven disjoint) in function " + << F.getName() << ":\n" + << " Xor: " << *XorInst << "\n" + << " Base Val: " << *XorWithSmallConst << "\n" + << " New Const: " << NewConstVal << "\n"); + } + } + + for (Instruction *I : InstructionsToErase) + I->eraseFromParent(); + + return Changed; +} + +// Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes +// the base for memory operations. This transformation is true under the +// following conditions +// Check 1 - B and C are disjoint. +// Check 2 - XOR(A,C) and B are disjoint. +// +// This transformation is beneficial particularly for GEPs because: +// 1. OR operations often map better to addressing modes than XOR +// 2. 
Disjoint OR operations preserve the semantics of the original XOR +// 3. This can enable further optimizations in the GEP offset folding pipeline +bool XorToOrDisjointTransformer::run() { + bool Changed = false; + + // Collect all candidate XORs + for (Instruction &I : instructions(F)) { + Instruction *Op0 = nullptr; + ConstantInt *C1 = nullptr; + BinaryOperator *MatchedXorOp = nullptr; + + // Attempt to match the instruction 'I' as XOR operation. + if (match(&I, m_CombineAnd(m_Xor(m_Instruction(Op0), m_ConstantInt(C1)), + m_BinOp(MatchedXorOp))) && + hasGEPUser(MatchedXorOp)) + XorGroups[Op0].emplace_back(MatchedXorOp, C1->getValue()); + } + + if (XorGroups.empty()) + return false; + + // Process each group of XORs + for (auto &[OriginalBaseInst, XorsInGroup] : XorGroups) + if (processXorGroup(OriginalBaseInst, XorsInGroup)) + Changed = true; + + return Changed; +} + bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -1242,6 +1430,11 @@ bool SeparateConstOffsetFromGEP::run(Function &F) { DL = &F.getDataLayout(); bool Changed = false; + + // Decompose xor in to "or disjoint" if possible. 
+ XorToOrDisjointTransformer XorTransformer(F, *DT, *DL); + Changed |= XorTransformer.run(); + for (BasicBlock &B : F) { if (!DT->isReachableFromEntry(&B)) continue; diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll new file mode 100644 index 0000000000000..825227292fe14 --- /dev/null +++ b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll @@ -0,0 +1,204 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \ +; RUN: -S < %s | FileCheck %s + + +; Test a simple case of xor to or disjoint transformation +define half @test_basic_transformation(ptr %ptr, i64 %input) { +; CHECK-LABEL: define half @test_basic_transformation( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[BASE:%.*]] = and i64 [[INPUT]], -8192 +; CHECK-NEXT: [[ADDR1:%.*]] = xor i64 [[BASE]], 32 +; CHECK-NEXT: [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048 +; CHECK-NEXT: [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 4096 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]] +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]] +; CHECK-NEXT: [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2 +; CHECK-NEXT: [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2 +; CHECK-NEXT: [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2 +; CHECK-NEXT: [[VAL1_F:%.*]] = fpext half [[VAL1]] to float +; CHECK-NEXT: [[VAL2_F:%.*]] = fpext half [[VAL2]] to float +; CHECK-NEXT: [[VAL3_F:%.*]] = fpext half [[VAL3]] to float +; CHECK-NEXT: [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]] +; CHECK-NEXT: [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]] +; 
CHECK-NEXT: [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half +; CHECK-NEXT: ret half [[RESULT_H]] +; +entry: + %base = and i64 %input, -8192 ; Clear low bits + %addr1 = xor i64 %base, 32 + %addr2 = xor i64 %base, 2080 + %addr3 = xor i64 %base, 4128 + %gep1 = getelementptr i8, ptr %ptr, i64 %addr1 + %gep2 = getelementptr i8, ptr %ptr, i64 %addr2 + %gep3 = getelementptr i8, ptr %ptr, i64 %addr3 + %val1 = load half, ptr %gep1 + %val2 = load half, ptr %gep2 + %val3 = load half, ptr %gep3 + %val1.f = fpext half %val1 to float + %val2.f = fpext half %val2 to float + %val3.f = fpext half %val3 to float + %sum1.f = fadd float %val1.f, %val2.f + %sum_total.f = fadd float %sum1.f, %val3.f + %result.h = fptrunc float %sum_total.f to half + ret half %result.h +} + + +; Test the decreasing order of offset xor to or disjoint transformation +define half @test_descending_offset_transformation(ptr %ptr, i64 %input) { +; CHECK-LABEL: define half @test_descending_offset_transformation( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[BASE:%.*]] = and i64 [[INPUT]], -8192 +; CHECK-NEXT: [[ADDR3_DOM_CLONE:%.*]] = xor i64 [[BASE]], 32 +; CHECK-NEXT: [[ADDR1_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 4096 +; CHECK-NEXT: [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 2048 +; CHECK-NEXT: [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 0 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1_OR_DISJOINT]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]] +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]] +; CHECK-NEXT: [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2 +; CHECK-NEXT: [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2 +; CHECK-NEXT: [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2 +; CHECK-NEXT: [[VAL1_F:%.*]] = fpext half [[VAL1]] to float +; CHECK-NEXT: [[VAL2_F:%.*]] = fpext half 
[[VAL2]] to float +; CHECK-NEXT: [[VAL3_F:%.*]] = fpext half [[VAL3]] to float +; CHECK-NEXT: [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]] +; CHECK-NEXT: [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]] +; CHECK-NEXT: [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half +; CHECK-NEXT: ret half [[RESULT_H]] +; +entry: + %base = and i64 %input, -8192 ; Clear low bits + %addr1 = xor i64 %base, 4128 + %addr2 = xor i64 %base, 2080 + %addr3 = xor i64 %base, 32 + %gep1 = getelementptr i8, ptr %ptr, i64 %addr1 + %gep2 = getelementptr i8, ptr %ptr, i64 %addr2 + %gep3 = getelementptr i8, ptr %ptr, i64 %addr3 + %val1 = load half, ptr %gep1 + %val2 = load half, ptr %gep2 + %val3 = load half, ptr %gep3 + %val1.f = fpext half %val1 to float + %val2.f = fpext half %val2 to float + %val3.f = fpext half %val3 to float + %sum1.f = fadd float %val1.f, %val2.f + %sum_total.f = fadd float %sum1.f, %val3.f + %result.h = fptrunc float %sum_total.f to half + ret half %result.h +} + + +; Test that %addr2 is not transformed to or disjoint. 
+define half @test_no_transfomation(ptr %ptr, i64 %input) { +; CHECK-LABEL: define half @test_no_transfomation( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[BASE:%.*]] = and i64 [[INPUT]], -8192 +; CHECK-NEXT: [[ADDR1:%.*]] = xor i64 [[BASE]], 32 +; CHECK-NEXT: [[ADDR2:%.*]] = xor i64 [[BASE]], 64 +; CHECK-NEXT: [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]] +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2]] +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]] +; CHECK-NEXT: [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2 +; CHECK-NEXT: [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2 +; CHECK-NEXT: [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2 +; CHECK-NEXT: [[VAL1_F:%.*]] = fpext half [[VAL1]] to float +; CHECK-NEXT: [[VAL2_F:%.*]] = fpext half [[VAL2]] to float +; CHECK-NEXT: [[VAL3_F:%.*]] = fpext half [[VAL3]] to float +; CHECK-NEXT: [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]] +; CHECK-NEXT: [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]] +; CHECK-NEXT: [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half +; CHECK-NEXT: ret half [[RESULT_H]] +; +entry: + %base = and i64 %input, -8192 ; Clear low bits + %addr1 = xor i64 %base, 32 + %addr2 = xor i64 %base, 64 ; Should not be transformed + %addr3 = xor i64 %base, 2080 + %gep1 = getelementptr i8, ptr %ptr, i64 %addr1 + %gep2 = getelementptr i8, ptr %ptr, i64 %addr2 + %gep3 = getelementptr i8, ptr %ptr, i64 %addr3 + %val1 = load half, ptr %gep1 + %val2 = load half, ptr %gep2 + %val3 = load half, ptr %gep3 + %val1.f = fpext half %val1 to float + %val2.f = fpext half %val2 to float + %val3.f = fpext half %val3 to float + %sum1.f = fadd float %val1.f, %val2.f + %sum_total.f = fadd float %sum1.f, %val3.f + %result.h = fptrunc float %sum_total.f to half + ret half %result.h +} + + +; Test case with xor 
instructions in different basic blocks +define half @test_dom_tree(ptr %ptr, i64 %input, i1 %cond) { +; CHECK-LABEL: define half @test_dom_tree( +; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]], i1 [[COND:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[BASE:%.*]] = and i64 [[INPUT]], -8192 +; CHECK-NEXT: [[ADDR1:%.*]] = xor i64 [[BASE]], 16 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]] +; CHECK-NEXT: [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2 +; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] +; CHECK: [[THEN]]: +; CHECK-NEXT: [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 32 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]] +; CHECK-NEXT: [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2 +; CHECK-NEXT: br label %[[MERGE:.*]] +; CHECK: [[ELSE]]: +; CHECK-NEXT: [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 96 +; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]] +; CHECK-NEXT: [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2 +; CHECK-NEXT: br label %[[MERGE]] +; CHECK: [[MERGE]]: +; CHECK-NEXT: [[VAL_FROM_BRANCH:%.*]] = phi half [ [[VAL2]], %[[THEN]] ], [ [[VAL3]], %[[ELSE]] ] +; CHECK-NEXT: [[ADDR4_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 224 +; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR4_OR_DISJOINT]] +; CHECK-NEXT: [[VAL4:%.*]] = load half, ptr [[GEP4]], align 2 +; CHECK-NEXT: [[VAL1_F:%.*]] = fpext half [[VAL1]] to float +; CHECK-NEXT: [[VAL_FROM_BRANCH_F:%.*]] = fpext half [[VAL_FROM_BRANCH]] to float +; CHECK-NEXT: [[VAL4_F:%.*]] = fpext half [[VAL4]] to float +; CHECK-NEXT: [[SUM_INTERMEDIATE_F:%.*]] = fadd float [[VAL1_F]], [[VAL_FROM_BRANCH_F]] +; CHECK-NEXT: [[FINAL_SUM_F:%.*]] = fadd float [[SUM_INTERMEDIATE_F]], [[VAL4_F]] +; CHECK-NEXT: [[RESULT_H:%.*]] = fptrunc float [[FINAL_SUM_F]] to half +; CHECK-NEXT: ret half [[RESULT_H]] +; +entry: + %base = and i64 %input, -8192 ; Clear 
low bits + %addr1 = xor i64 %base,16 + %gep1 = getelementptr i8, ptr %ptr, i64 %addr1 + %val1 = load half, ptr %gep1 + br i1 %cond, label %then, label %else + +then: + %addr2 = xor i64 %base, 48 + %gep2 = getelementptr i8, ptr %ptr, i64 %addr2 + %val2 = load half, ptr %gep2 + br label %merge + +else: + %addr3 = xor i64 %base, 112 + %gep3 = getelementptr i8, ptr %ptr, i64 %addr3 + %val3 = load half, ptr %gep3 + br label %merge + +merge: + %val_from_branch = phi half [ %val2, %then ], [ %val3, %else ] + %addr4 = xor i64 %base, 240 + %gep4 = getelementptr i8, ptr %ptr, i64 %addr4 + %val4 = load half, ptr %gep4 + %val1.f = fpext half %val1 to float + %val_from_branch.f = fpext half %val_from_branch to float + %val4.f = fpext half %val4 to float + %sum_intermediate.f = fadd float %val1.f, %val_from_branch.f + %final_sum.f = fadd float %sum_intermediate.f, %val4.f + %result.h = fptrunc float %final_sum.f to half + ret half %result.h +} + From 0c774682889ae9b1b89cb9d4d796283f205b8a63 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Tue, 10 Jun 2025 14:31:22 -0700 Subject: [PATCH 006/851] [BOLT] Expose external entry count for functions (#141674) Record the number of function invocations from external code - code outside the binary, which may include JIT code and DSOs. Accounting external entry counts improves the fidelity of call graph flow conservation analysis. 
Test Plan: updated shrinkwrapping.test --- bolt/include/bolt/Core/BinaryFunction.h | 12 ++++++++++++ bolt/include/bolt/Profile/DataReader.h | 3 +++ bolt/include/bolt/Profile/ProfileYAMLMapping.h | 2 ++ bolt/lib/Core/BinaryFunction.cpp | 2 ++ bolt/lib/Passes/ProfileQualityStats.cpp | 3 +++ bolt/lib/Profile/DataAggregator.cpp | 1 + bolt/lib/Profile/DataReader.cpp | 6 ++++++ bolt/lib/Profile/YAMLProfileReader.cpp | 1 + bolt/lib/Profile/YAMLProfileWriter.cpp | 1 + bolt/test/X86/shrinkwrapping.test | 2 ++ 10 files changed, 33 insertions(+) diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 14957cba50174..ca8b786f4ab69 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -388,6 +388,10 @@ class BinaryFunction { /// The profile data for the number of times the function was executed. uint64_t ExecutionCount{COUNT_NO_PROFILE}; + /// Profile data for the number of times this function was entered from + /// external code (DSO, JIT, etc). + uint64_t ExternEntryCount{0}; + /// Profile match ratio. float ProfileMatchRatio{0.0f}; @@ -1877,6 +1881,10 @@ class BinaryFunction { return *this; } + /// Set the profile data for the number of times the function was entered from + /// external code (DSO/JIT). + void setExternEntryCount(uint64_t Count) { ExternEntryCount = Count; } + /// Adjust execution count for the function by a given \p Count. The value /// \p Count will be subtracted from the current function count. /// @@ -1904,6 +1912,10 @@ class BinaryFunction { /// Return COUNT_NO_PROFILE if there's no profile info. uint64_t getExecutionCount() const { return ExecutionCount; } + /// Return the profile information about the number of times the function was + /// entered from external code (DSO/JIT). + uint64_t getExternEntryCount() const { return ExternEntryCount; } + /// Return the raw profile information about the number of branch /// executions corresponding to this function. 
uint64_t getRawSampleCount() const { return RawSampleCount; } diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h index 3c770fed2598f..6f527ba3931d4 100644 --- a/bolt/include/bolt/Profile/DataReader.h +++ b/bolt/include/bolt/Profile/DataReader.h @@ -97,6 +97,9 @@ struct FuncBranchData { /// Total execution count for the function. int64_t ExecutionCount{0}; + /// Total entry count from external code for the function. + uint64_t ExternEntryCount{0}; + /// Indicate if the data was used. bool Used{false}; diff --git a/bolt/include/bolt/Profile/ProfileYAMLMapping.h b/bolt/include/bolt/Profile/ProfileYAMLMapping.h index a8d9a15311d94..41e2bd1651efd 100644 --- a/bolt/include/bolt/Profile/ProfileYAMLMapping.h +++ b/bolt/include/bolt/Profile/ProfileYAMLMapping.h @@ -206,6 +206,7 @@ struct BinaryFunctionProfile { uint32_t Id{0}; llvm::yaml::Hex64 Hash{0}; uint64_t ExecCount{0}; + uint64_t ExternEntryCount{0}; std::vector Blocks; std::vector InlineTree; bool Used{false}; @@ -218,6 +219,7 @@ template <> struct MappingTraits { YamlIO.mapRequired("fid", BFP.Id); YamlIO.mapRequired("hash", BFP.Hash); YamlIO.mapRequired("exec", BFP.ExecCount); + YamlIO.mapOptional("extern", BFP.ExternEntryCount, 0); YamlIO.mapRequired("nblocks", BFP.NumBasicBlocks); YamlIO.mapOptional("blocks", BFP.Blocks, std::vector()); diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 6d1969f5c6c30..b998d7160aae7 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -471,6 +471,8 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { OS << "\n Sample Count: " << RawSampleCount; OS << "\n Profile Acc : " << format("%.1f%%", ProfileMatchRatio * 100.0f); } + if (ExternEntryCount) + OS << "\n Extern Entry Count: " << ExternEntryCount; if (opts::PrintDynoStats && !getLayout().block_empty()) { OS << '\n'; diff --git a/bolt/lib/Passes/ProfileQualityStats.cpp 
b/bolt/lib/Passes/ProfileQualityStats.cpp index dfd74d3dd5719..64cc662c3ab29 100644 --- a/bolt/lib/Passes/ProfileQualityStats.cpp +++ b/bolt/lib/Passes/ProfileQualityStats.cpp @@ -532,6 +532,9 @@ void computeFlowMappings(const BinaryContext &BC, FlowInfo &TotalFlowMap) { std::vector &MaxCountMap = TotalMaxCountMaps[FunctionNum]; std::vector &MinCountMap = TotalMinCountMaps[FunctionNum]; + // Record external entry count into CallGraphIncomingFlows + CallGraphIncomingFlows[FunctionNum] += Function->getExternEntryCount(); + // Update MaxCountMap, MinCountMap, and CallGraphIncomingFlows auto recordCall = [&](const BinaryBasicBlock *SourceBB, const MCSymbol *DestSymbol, uint64_t Count, diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 4022212bcf1b6..308346e5d02ce 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -2255,6 +2255,7 @@ std::error_code DataAggregator::writeBATYAML(BinaryContext &BC, YamlBF.Id = BF->getFunctionNumber(); YamlBF.Hash = BAT->getBFHash(FuncAddress); YamlBF.ExecCount = BF->getKnownExecutionCount(); + YamlBF.ExternEntryCount = BF->getExternEntryCount(); YamlBF.NumBasicBlocks = BAT->getNumBasicBlocks(FuncAddress); const BoltAddressTranslation::BBHashMapTy &BlockMap = BAT->getBBHashMap(FuncAddress); diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index c512394f26a3b..afe24216d7f5d 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -85,6 +85,7 @@ void FuncBranchData::appendFrom(const FuncBranchData &FBD, uint64_t Offset) { } llvm::stable_sort(Data); ExecutionCount += FBD.ExecutionCount; + ExternEntryCount += FBD.ExternEntryCount; for (auto I = FBD.EntryData.begin(), E = FBD.EntryData.end(); I != E; ++I) { assert(I->To.Name == FBD.Name); auto NewElmt = EntryData.insert(EntryData.end(), *I); @@ -269,6 +270,7 @@ Error DataReader::preprocessProfile(BinaryContext &BC) { if (FuncBranchData *FuncData = 
getBranchDataForNames(Function.getNames())) { setBranchData(Function, FuncData); Function.ExecutionCount = FuncData->ExecutionCount; + Function.ExternEntryCount = FuncData->ExternEntryCount; FuncData->Used = true; } } @@ -419,6 +421,7 @@ void DataReader::matchProfileData(BinaryFunction &BF) { if (fetchProfileForOtherEntryPoints(BF)) { BF.ProfileMatchRatio = evaluateProfileData(BF, *FBD); BF.ExecutionCount = FBD->ExecutionCount; + BF.ExternEntryCount = FBD->ExternEntryCount; BF.RawSampleCount = FBD->getNumExecutedBranches(); } return; @@ -449,6 +452,7 @@ void DataReader::matchProfileData(BinaryFunction &BF) { setBranchData(BF, NewBranchData); NewBranchData->Used = true; BF.ExecutionCount = NewBranchData->ExecutionCount; + BF.ExternEntryCount = NewBranchData->ExternEntryCount; BF.ProfileMatchRatio = 1.0f; break; } @@ -1190,6 +1194,8 @@ std::error_code DataReader::parse() { if (BI.To.IsSymbol && BI.To.Offset == 0) { I = GetOrCreateFuncEntry(BI.To.Name); I->second.ExecutionCount += BI.Branches; + if (!BI.From.IsSymbol) + I->second.ExternEntryCount += BI.Branches; } } diff --git a/bolt/lib/Profile/YAMLProfileReader.cpp b/bolt/lib/Profile/YAMLProfileReader.cpp index 33ce40ac2eeec..086e47b661e10 100644 --- a/bolt/lib/Profile/YAMLProfileReader.cpp +++ b/bolt/lib/Profile/YAMLProfileReader.cpp @@ -176,6 +176,7 @@ bool YAMLProfileReader::parseFunctionProfile( uint64_t FunctionExecutionCount = 0; BF.setExecutionCount(YamlBF.ExecCount); + BF.setExternEntryCount(YamlBF.ExternEntryCount); uint64_t FuncRawBranchCount = 0; for (const yaml::bolt::BinaryBasicBlockProfile &YamlBB : YamlBF.Blocks) diff --git a/bolt/lib/Profile/YAMLProfileWriter.cpp b/bolt/lib/Profile/YAMLProfileWriter.cpp index 0ae67a4d35595..1632aa1c6bfe2 100644 --- a/bolt/lib/Profile/YAMLProfileWriter.cpp +++ b/bolt/lib/Profile/YAMLProfileWriter.cpp @@ -226,6 +226,7 @@ YAMLProfileWriter::convert(const BinaryFunction &BF, bool UseDFS, YamlBF.Hash = BF.getHash(); YamlBF.NumBasicBlocks = BF.size(); YamlBF.ExecCount = 
BF.getKnownExecutionCount(); + YamlBF.ExternEntryCount = BF.getExternEntryCount(); DenseMap InlineTreeNodeId; if (PseudoProbeDecoder && BF.getGUID()) { std::tie(YamlBF.InlineTree, InlineTreeNodeId) = diff --git a/bolt/test/X86/shrinkwrapping.test b/bolt/test/X86/shrinkwrapping.test index 8581d7e0c0f7b..521b4561b3ba6 100644 --- a/bolt/test/X86/shrinkwrapping.test +++ b/bolt/test/X86/shrinkwrapping.test @@ -8,6 +8,7 @@ REQUIRES: shell RUN: %clangxx %cxxflags -no-pie %S/Inputs/exc4sw.S -o %t.exe -Wl,-q RUN: llvm-bolt %t.exe -o %t --relocs --frame-opt=all \ +RUN: --print-only=main --print-cfg \ RUN: --data=%p/Inputs/exc4sw.fdata --reorder-blocks=cache 2>&1 | \ RUN: FileCheck %s --check-prefix=CHECK-BOLT @@ -19,6 +20,7 @@ RUN: llvm-objdump --dwarf=frames %t | grep -A20 -e \ RUN: `llvm-nm --numeric-sort %t | grep main | tail -n 1 | cut -f1 -d' ' | \ RUN: tail -c9` 2>&1 | FileCheck %s --check-prefix=CHECK-OUTPUT +CHECK-BOLT: Extern Entry Count: 100 CHECK-BOLT: Shrink wrapping moved 2 spills inserting load/stores and 0 spills inserting push/pops CHECK-INPUT: DW_CFA_advance_loc: 2 From 163c67ad3d1bf7af6590930d8f18700d65ad4564 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Jun 2025 14:44:19 -0700 Subject: [PATCH 007/851] [flang][runtime] Replace recursion with iterative work queue (#137727) Recursion, both direct and indirect, prevents accurate stack size calculation at link time for GPU device code. Restructure these recursive (often mutually so) routines in the Fortran runtime with new implementations based on an iterative work queue with suspendable/resumable work tickets: Assign, Initialize, initializeClone, Finalize, and Destroy. Default derived type I/O is also recursive, but already disabled. It can be added to this new framework later if the overall approach succeeds. Note that derived type FINAL subroutine calls, defined assignments, and defined I/O procedures all perform callbacks into user code, which may well reenter the runtime library. 
This kind of recursion is not handled by this change, although it may be possible to do so in the future using thread-local work queues. The effects of this restructuring on CPU performance are yet to be measured. --- .../include/flang-rt/runtime/environment.h | 3 + flang-rt/include/flang-rt/runtime/stat.h | 10 +- flang-rt/include/flang-rt/runtime/type-info.h | 2 + .../include/flang-rt/runtime/work-queue.h | 548 +++++++++++++++ flang-rt/lib/runtime/CMakeLists.txt | 2 + flang-rt/lib/runtime/assign.cpp | 623 +++++++++++------ flang-rt/lib/runtime/derived.cpp | 517 +++++++------- flang-rt/lib/runtime/descriptor-io.cpp | 651 +++++++++++++++++- flang-rt/lib/runtime/descriptor-io.h | 620 +---------------- flang-rt/lib/runtime/environment.cpp | 4 + flang-rt/lib/runtime/namelist.cpp | 1 + flang-rt/lib/runtime/tools.cpp | 4 +- flang-rt/lib/runtime/type-info.cpp | 6 +- flang-rt/lib/runtime/work-queue.cpp | 161 +++++ flang-rt/unittests/Runtime/ExternalIOTest.cpp | 2 +- flang/docs/Extensions.md | 10 + flang/include/flang/Runtime/assign.h | 2 +- flang/include/flang/Semantics/tools.h | 7 +- flang/lib/Semantics/runtime-type-info.cpp | 4 + flang/lib/Semantics/tools.cpp | 32 + flang/module/__fortran_type_info.f90 | 3 +- flang/test/Lower/volatile-openmp.f90 | 8 +- flang/test/Semantics/typeinfo01.f90 | 30 +- flang/test/Semantics/typeinfo03.f90 | 2 +- flang/test/Semantics/typeinfo04.f90 | 8 +- flang/test/Semantics/typeinfo05.f90 | 4 +- flang/test/Semantics/typeinfo06.f90 | 4 +- flang/test/Semantics/typeinfo07.f90 | 8 +- flang/test/Semantics/typeinfo08.f90 | 2 +- flang/test/Semantics/typeinfo11.f90 | 2 +- flang/test/Semantics/typeinfo12.f90 | 67 ++ 31 files changed, 2227 insertions(+), 1120 deletions(-) create mode 100644 flang-rt/include/flang-rt/runtime/work-queue.h create mode 100644 flang-rt/lib/runtime/work-queue.cpp create mode 100644 flang/test/Semantics/typeinfo12.f90 diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h 
index 16258b3bbba9b..e579f6012ce86 100644 --- a/flang-rt/include/flang-rt/runtime/environment.h +++ b/flang-rt/include/flang-rt/runtime/environment.h @@ -64,6 +64,9 @@ struct ExecutionEnvironment { bool defaultUTF8{false}; // DEFAULT_UTF8 bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION + enum InternalDebugging { WorkQueue = 1 }; + int internalDebugging{0}; // FLANG_RT_DEBUG + // CUDA related variables std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED diff --git a/flang-rt/include/flang-rt/runtime/stat.h b/flang-rt/include/flang-rt/runtime/stat.h index 070d0bf8673fb..dc372de53506a 100644 --- a/flang-rt/include/flang-rt/runtime/stat.h +++ b/flang-rt/include/flang-rt/runtime/stat.h @@ -24,7 +24,7 @@ class Terminator; enum Stat { StatOk = 0, // required to be zero by Fortran - // Interoperable STAT= codes + // Interoperable STAT= codes (>= 11) StatBaseNull = CFI_ERROR_BASE_ADDR_NULL, StatBaseNotNull = CFI_ERROR_BASE_ADDR_NOT_NULL, StatInvalidElemLen = CFI_INVALID_ELEM_LEN, @@ -36,7 +36,7 @@ enum Stat { StatMemAllocation = CFI_ERROR_MEM_ALLOCATION, StatOutOfBounds = CFI_ERROR_OUT_OF_BOUNDS, - // Standard STAT= values + // Standard STAT= values (>= 101) StatFailedImage = FORTRAN_RUNTIME_STAT_FAILED_IMAGE, StatLocked = FORTRAN_RUNTIME_STAT_LOCKED, StatLockedOtherImage = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE, @@ -49,10 +49,14 @@ enum Stat { // Additional "processor-defined" STAT= values StatInvalidArgumentNumber = FORTRAN_RUNTIME_STAT_INVALID_ARG_NUMBER, StatMissingArgument = FORTRAN_RUNTIME_STAT_MISSING_ARG, - StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, + StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, // -1 StatMoveAllocSameAllocatable = FORTRAN_RUNTIME_STAT_MOVE_ALLOC_SAME_ALLOCATABLE, StatBadPointerDeallocation = FORTRAN_RUNTIME_STAT_BAD_POINTER_DEALLOCATION, + + // Dummy status for work queue continuation, declared here to perhaps + // avoid 
collisions + StatContinue = 201 }; RT_API_ATTRS const char *StatErrorString(int); diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h index 5e79efde164f2..9bde3adba87f5 100644 --- a/flang-rt/include/flang-rt/runtime/type-info.h +++ b/flang-rt/include/flang-rt/runtime/type-info.h @@ -240,6 +240,7 @@ class DerivedType { RT_API_ATTRS bool noFinalizationNeeded() const { return noFinalizationNeeded_; } + RT_API_ATTRS bool noDefinedAssignment() const { return noDefinedAssignment_; } RT_API_ATTRS std::size_t LenParameters() const { return lenParameterKind().Elements(); @@ -322,6 +323,7 @@ class DerivedType { bool noInitializationNeeded_{false}; bool noDestructionNeeded_{false}; bool noFinalizationNeeded_{false}; + bool noDefinedAssignment_{false}; }; } // namespace Fortran::runtime::typeInfo diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h new file mode 100644 index 0000000000000..878b18373e1d2 --- /dev/null +++ b/flang-rt/include/flang-rt/runtime/work-queue.h @@ -0,0 +1,548 @@ +//===-- include/flang-rt/runtime/work-queue.h -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Internal runtime utilities for work queues that replace the use of recursion +// for better GPU device support. +// +// A work queue comprises a list of tickets. Each ticket class has a Begin() +// member function, which is called once, and a Continue() member function +// that can be called zero or more times. A ticket's execution terminates +// when either of these member functions returns a status other than +// StatContinue. When that status is not StatOk, then the whole queue +// is shut down. 
+// +// By returning StatContinue from its Continue() member function, +// a ticket suspends its execution so that any nested tickets that it +// may have created can be run to completion. It is the responsibility +// of each ticket class to maintain resumption information in its state +// and manage its own progress. Most ticket classes inherit from +// class ComponentsOverElements, which implements an outer loop over all +// components of a derived type, and an inner loop over all elements +// of a descriptor, possibly with multiple phases of execution per element. +// +// Tickets are created by WorkQueue::Begin...() member functions. +// There is one of these for each "top level" recursive function in the +// Fortran runtime support library that has been restructured into this +// ticket framework. +// +// When the work queue is running tickets, it always selects the last ticket +// on the list for execution -- "work stack" might have been a more accurate +// name for this framework. This ticket may, while doing its job, create +// new tickets, and since those are pushed after the active one, the first +// such nested ticket will be the next one executed to completion -- i.e., +// the order of nested WorkQueue::Begin...() calls is respected. +// Note that a ticket's Continue() member function won't be called again +// until all nested tickets have run to completion and it is once again +// the last ticket on the queue. +// +// Example for an assignment to a derived type: +// 1. Assign() is called, and its work queue is created. It calls +// WorkQueue::BeginAssign() and then WorkQueue::Run(). +// 2. Run calls AssignTicket::Begin(), which pushes a ticket via +// BeginFinalize() and returns StatContinue. +// 3. FinalizeTicket::Begin() and FinalizeTicket::Continue() are called +// until one of them returns StatOk, which ends the finalization ticket. +// 4. 
AssignTicket::Continue() is then called; it creates a DerivedAssignTicket +// and then returns StatOk, which ends the ticket. +// 5. At this point, only one ticket remains. DerivedAssignTicket::Begin() +// and ::Continue() are called until they are done (not StatContinue). +// Along the way, it may create nested AssignTickets for components, +// and suspend itself so that they may each run to completion. + +#ifndef FLANG_RT_RUNTIME_WORK_QUEUE_H_ +#define FLANG_RT_RUNTIME_WORK_QUEUE_H_ + +#include "flang-rt/runtime/connection.h" +#include "flang-rt/runtime/descriptor.h" +#include "flang-rt/runtime/stat.h" +#include "flang-rt/runtime/type-info.h" +#include "flang/Common/api-attrs.h" +#include "flang/Runtime/freestanding-tools.h" +#include + +namespace Fortran::runtime::io { +class IoStatementState; +struct NonTbpDefinedIoTable; +} // namespace Fortran::runtime::io + +namespace Fortran::runtime { +class Terminator; +class WorkQueue; + +// Ticket worker base classes + +template class ImmediateTicketRunner { +public: + RT_API_ATTRS explicit ImmediateTicketRunner(TICKET &ticket) + : ticket_{ticket} {} + RT_API_ATTRS int Run(WorkQueue &workQueue) { + int status{ticket_.Begin(workQueue)}; + while (status == StatContinue) { + status = ticket_.Continue(workQueue); + } + return status; + } + +private: + TICKET &ticket_; +}; + +// Base class for ticket workers that operate elementwise over descriptors +class Elementwise { +protected: + RT_API_ATTRS Elementwise( + const Descriptor &instance, const Descriptor *from = nullptr) + : instance_{instance}, from_{from} { + instance_.GetLowerBounds(subscripts_); + if (from_) { + from_->GetLowerBounds(fromSubscripts_); + } + } + RT_API_ATTRS bool IsComplete() const { return elementAt_ >= elements_; } + RT_API_ATTRS void Advance() { + ++elementAt_; + instance_.IncrementSubscripts(subscripts_); + if (from_) { + from_->IncrementSubscripts(fromSubscripts_); + } + } + RT_API_ATTRS void SkipToEnd() { elementAt_ = elements_; } + RT_API_ATTRS 
void Reset() { + elementAt_ = 0; + instance_.GetLowerBounds(subscripts_); + if (from_) { + from_->GetLowerBounds(fromSubscripts_); + } + } + + const Descriptor &instance_, *from_{nullptr}; + std::size_t elements_{instance_.Elements()}; + std::size_t elementAt_{0}; + SubscriptValue subscripts_[common::maxRank]; + SubscriptValue fromSubscripts_[common::maxRank]; +}; + +// Base class for ticket workers that operate over derived type components. +class Componentwise { +protected: + RT_API_ATTRS Componentwise(const typeInfo::DerivedType &); + RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; } + RT_API_ATTRS void Advance() { + ++componentAt_; + GetComponent(); + } + RT_API_ATTRS void SkipToEnd() { + component_ = nullptr; + componentAt_ = components_; + } + RT_API_ATTRS void Reset() { + component_ = nullptr; + componentAt_ = 0; + GetComponent(); + } + RT_API_ATTRS void GetComponent(); + + const typeInfo::DerivedType &derived_; + std::size_t components_{0}, componentAt_{0}; + const typeInfo::Component *component_{nullptr}; + StaticDescriptor componentDescriptor_; +}; + +// Base class for ticket workers that operate over derived type components +// in an outer loop, and elements in an inner loop. 
+class ComponentsOverElements : protected Componentwise, protected Elementwise { +protected: + RT_API_ATTRS ComponentsOverElements(const Descriptor &instance, + const typeInfo::DerivedType &derived, const Descriptor *from = nullptr) + : Componentwise{derived}, Elementwise{instance, from} { + if (Elementwise::IsComplete()) { + Componentwise::SkipToEnd(); + } + } + RT_API_ATTRS bool IsComplete() const { return Componentwise::IsComplete(); } + RT_API_ATTRS void Advance() { + SkipToNextElement(); + if (Elementwise::IsComplete()) { + Elementwise::Reset(); + Componentwise::Advance(); + } + } + RT_API_ATTRS void SkipToNextElement() { + phase_ = 0; + Elementwise::Advance(); + } + RT_API_ATTRS void SkipToNextComponent() { + phase_ = 0; + Elementwise::Reset(); + Componentwise::Advance(); + } + RT_API_ATTRS void Reset() { + phase_ = 0; + Elementwise::Reset(); + Componentwise::Reset(); + } + + int phase_{0}; +}; + +// Base class for ticket workers that operate over elements in an outer loop, +// type components in an inner loop. 
+class ElementsOverComponents : protected Elementwise, protected Componentwise { +protected: + RT_API_ATTRS ElementsOverComponents(const Descriptor &instance, + const typeInfo::DerivedType &derived, const Descriptor *from = nullptr) + : Elementwise{instance, from}, Componentwise{derived} { + if (Componentwise::IsComplete()) { + Elementwise::SkipToEnd(); + } + } + RT_API_ATTRS bool IsComplete() const { return Elementwise::IsComplete(); } + RT_API_ATTRS void Advance() { + SkipToNextComponent(); + if (Componentwise::IsComplete()) { + Componentwise::Reset(); + Elementwise::Advance(); + } + } + RT_API_ATTRS void SkipToNextComponent() { + phase_ = 0; + Componentwise::Advance(); + } + RT_API_ATTRS void SkipToNextElement() { + phase_ = 0; + Componentwise::Reset(); + Elementwise::Advance(); + } + + int phase_{0}; +}; + +// Ticket worker classes + +// Implements derived type instance initialization +class InitializeTicket : public ImmediateTicketRunner, + private ComponentsOverElements { +public: + RT_API_ATTRS InitializeTicket( + const Descriptor &instance, const typeInfo::DerivedType &derived) + : ImmediateTicketRunner{*this}, + ComponentsOverElements{instance, derived} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); +}; + +// Initializes one derived type instance from the value of another +class InitializeCloneTicket + : public ImmediateTicketRunner, + private ComponentsOverElements { +public: + RT_API_ATTRS InitializeCloneTicket(const Descriptor &clone, + const Descriptor &original, const typeInfo::DerivedType &derived, + bool hasStat, const Descriptor *errMsg) + : ImmediateTicketRunner{*this}, + ComponentsOverElements{original, derived}, clone_{clone}, + hasStat_{hasStat}, errMsg_{errMsg} {} + RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; } + RT_API_ATTRS int Continue(WorkQueue &); + +private: + const Descriptor &clone_; + bool hasStat_{false}; + const Descriptor *errMsg_{nullptr}; + StaticDescriptor 
cloneComponentDescriptor_; +}; + +// Implements derived type instance finalization +class FinalizeTicket : public ImmediateTicketRunner, + private ComponentsOverElements { +public: + RT_API_ATTRS FinalizeTicket( + const Descriptor &instance, const typeInfo::DerivedType &derived) + : ImmediateTicketRunner{*this}, + ComponentsOverElements{instance, derived} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + +private: + const typeInfo::DerivedType *finalizableParentType_{nullptr}; +}; + +// Implements derived type instance destruction +class DestroyTicket : public ImmediateTicketRunner, + private ComponentsOverElements { +public: + RT_API_ATTRS DestroyTicket(const Descriptor &instance, + const typeInfo::DerivedType &derived, bool finalize) + : ImmediateTicketRunner{*this}, + ComponentsOverElements{instance, derived}, finalize_{finalize} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + +private: + bool finalize_{false}; +}; + +// Implements general intrinsic assignment +class AssignTicket : public ImmediateTicketRunner { +public: + RT_API_ATTRS AssignTicket( + Descriptor &to, const Descriptor &from, int flags, MemmoveFct memmoveFct) + : ImmediateTicketRunner{*this}, to_{to}, from_{&from}, + flags_{flags}, memmoveFct_{memmoveFct} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + +private: + RT_API_ATTRS bool IsSimpleMemmove() const { + return !toDerived_ && to_.rank() == from_->rank() && to_.IsContiguous() && + from_->IsContiguous() && to_.ElementBytes() == from_->ElementBytes(); + } + RT_API_ATTRS Descriptor &GetTempDescriptor(); + + Descriptor &to_; + const Descriptor *from_{nullptr}; + int flags_{0}; // enum AssignFlags + MemmoveFct memmoveFct_{nullptr}; + StaticDescriptor tempDescriptor_; + const typeInfo::DerivedType *toDerived_{nullptr}; + Descriptor *toDeallocate_{nullptr}; + bool persist_{false}; + bool done_{false}; +}; + +// Implements derived type 
intrinsic assignment. +template +class DerivedAssignTicket + : public ImmediateTicketRunner>, + private std::conditional_t { +public: + using Base = std::conditional_t; + RT_API_ATTRS DerivedAssignTicket(const Descriptor &to, const Descriptor &from, + const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct, + Descriptor *deallocateAfter) + : ImmediateTicketRunner{*this}, + Base{to, derived, &from}, flags_{flags}, memmoveFct_{memmoveFct}, + deallocateAfter_{deallocateAfter} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + +private: + static constexpr bool isComponentwise_{IS_COMPONENTWISE}; + bool toIsContiguous_{this->instance_.IsContiguous()}; + bool fromIsContiguous_{this->from_->IsContiguous()}; + int flags_{0}; + MemmoveFct memmoveFct_{nullptr}; + Descriptor *deallocateAfter_{nullptr}; + StaticDescriptor fromComponentDescriptor_; +}; + +namespace io::descr { + +template +class DescriptorIoTicket + : public ImmediateTicketRunner>, + private Elementwise { +public: + RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io, + const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table, + bool &anyIoTookPlace) + : ImmediateTicketRunner(*this), + Elementwise{descriptor}, io_{io}, table_{table}, + anyIoTookPlace_{anyIoTookPlace} {} + RT_API_ATTRS int Begin(WorkQueue &); + RT_API_ATTRS int Continue(WorkQueue &); + RT_API_ATTRS bool &anyIoTookPlace() { return anyIoTookPlace_; } + +private: + io::IoStatementState &io_; + const io::NonTbpDefinedIoTable *table_{nullptr}; + bool &anyIoTookPlace_; + common::optional nonTbpSpecial_; + const typeInfo::DerivedType *derived_{nullptr}; + const typeInfo::SpecialBinding *special_{nullptr}; + StaticDescriptor elementDescriptor_; +}; + +template +class DerivedIoTicket : public ImmediateTicketRunner>, + private ElementsOverComponents { +public: + RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io, + const Descriptor &descriptor, const typeInfo::DerivedType &derived, + 
const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) + : ImmediateTicketRunner(*this), + ElementsOverComponents{descriptor, derived}, io_{io}, table_{table}, + anyIoTookPlace_{anyIoTookPlace} {} + RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; } + RT_API_ATTRS int Continue(WorkQueue &); + +private: + io::IoStatementState &io_; + const io::NonTbpDefinedIoTable *table_{nullptr}; + bool &anyIoTookPlace_; +}; + +} // namespace io::descr + +struct NullTicket { + RT_API_ATTRS int Begin(WorkQueue &) const { return StatOk; } + RT_API_ATTRS int Continue(WorkQueue &) const { return StatOk; } +}; + +struct Ticket { + RT_API_ATTRS int Continue(WorkQueue &); + bool begun{false}; + std::variant, + DerivedAssignTicket, + io::descr::DescriptorIoTicket, + io::descr::DescriptorIoTicket, + io::descr::DerivedIoTicket, + io::descr::DerivedIoTicket> + u; +}; + +class WorkQueue { +public: + RT_API_ATTRS explicit WorkQueue(Terminator &terminator) + : terminator_{terminator} { + for (int j{1}; j < numStatic_; ++j) { + static_[j].previous = &static_[j - 1]; + static_[j - 1].next = &static_[j]; + } + } + RT_API_ATTRS ~WorkQueue(); + RT_API_ATTRS Terminator &terminator() { return terminator_; }; + + // APIs for particular tasks. These can return StatOk if the work is + // completed immediately. 
+ RT_API_ATTRS int BeginInitialize( + const Descriptor &descriptor, const typeInfo::DerivedType &derived) { + if (runTicketsImmediately_) { + return InitializeTicket{descriptor, derived}.Run(*this); + } else { + StartTicket().u.emplace(descriptor, derived); + return StatContinue; + } + } + RT_API_ATTRS int BeginInitializeClone(const Descriptor &clone, + const Descriptor &original, const typeInfo::DerivedType &derived, + bool hasStat, const Descriptor *errMsg) { + if (runTicketsImmediately_) { + return InitializeCloneTicket{clone, original, derived, hasStat, errMsg} + .Run(*this); + } else { + StartTicket().u.emplace( + clone, original, derived, hasStat, errMsg); + return StatContinue; + } + } + RT_API_ATTRS int BeginFinalize( + const Descriptor &descriptor, const typeInfo::DerivedType &derived) { + if (runTicketsImmediately_) { + return FinalizeTicket{descriptor, derived}.Run(*this); + } else { + StartTicket().u.emplace(descriptor, derived); + return StatContinue; + } + } + RT_API_ATTRS int BeginDestroy(const Descriptor &descriptor, + const typeInfo::DerivedType &derived, bool finalize) { + if (runTicketsImmediately_) { + return DestroyTicket{descriptor, derived, finalize}.Run(*this); + } else { + StartTicket().u.emplace(descriptor, derived, finalize); + return StatContinue; + } + } + RT_API_ATTRS int BeginAssign(Descriptor &to, const Descriptor &from, + int flags, MemmoveFct memmoveFct) { + if (runTicketsImmediately_) { + return AssignTicket{to, from, flags, memmoveFct}.Run(*this); + } else { + StartTicket().u.emplace(to, from, flags, memmoveFct); + return StatContinue; + } + } + template + RT_API_ATTRS int BeginDerivedAssign(Descriptor &to, const Descriptor &from, + const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct, + Descriptor *deallocateAfter) { + if (runTicketsImmediately_) { + return DerivedAssignTicket{ + to, from, derived, flags, memmoveFct, deallocateAfter} + .Run(*this); + } else { + StartTicket().u.emplace>( + to, from, derived, 
flags, memmoveFct, deallocateAfter); + return StatContinue; + } + } + template + RT_API_ATTRS int BeginDescriptorIo(io::IoStatementState &io, + const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table, + bool &anyIoTookPlace) { + if (runTicketsImmediately_) { + return io::descr::DescriptorIoTicket{ + io, descriptor, table, anyIoTookPlace} + .Run(*this); + } else { + StartTicket().u.emplace>( + io, descriptor, table, anyIoTookPlace); + return StatContinue; + } + } + template + RT_API_ATTRS int BeginDerivedIo(io::IoStatementState &io, + const Descriptor &descriptor, const typeInfo::DerivedType &derived, + const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) { + if (runTicketsImmediately_) { + return io::descr::DerivedIoTicket{ + io, descriptor, derived, table, anyIoTookPlace} + .Run(*this); + } else { + StartTicket().u.emplace>( + io, descriptor, derived, table, anyIoTookPlace); + return StatContinue; + } + } + + RT_API_ATTRS int Run(); + +private: +#if RT_DEVICE_COMPILATION + // Always use the work queue on a GPU device to avoid recursion. + static constexpr bool runTicketsImmediately_{false}; +#else + // Avoid the work queue overhead on the host, unless it needs + // debugging, which is so much easier there. + static constexpr bool runTicketsImmediately_{true}; +#endif + + // Most uses of the work queue won't go very deep. 
+ static constexpr int numStatic_{2}; + + struct TicketList { + bool isStatic{true}; + Ticket ticket; + TicketList *previous{nullptr}, *next{nullptr}; + }; + + RT_API_ATTRS Ticket &StartTicket(); + RT_API_ATTRS void Stop(); + + Terminator &terminator_; + TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr}; + TicketList static_[numStatic_]; + TicketList *firstFree_{static_}; +}; + +} // namespace Fortran::runtime +#endif // FLANG_RT_RUNTIME_WORK_QUEUE_H_ diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index a3f63b4315644..332c0872e065f 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -68,6 +68,7 @@ set(supported_sources type-info.cpp unit.cpp utf.cpp + work-queue.cpp ) # List of source not used for GPU offloading. @@ -131,6 +132,7 @@ set(gpu_sources type-code.cpp type-info.cpp utf.cpp + work-queue.cpp complex-powi.cpp reduce.cpp reduction.cpp diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp index bf67b5dc8b645..41b130cc8f257 100644 --- a/flang-rt/lib/runtime/assign.cpp +++ b/flang-rt/lib/runtime/assign.cpp @@ -14,6 +14,7 @@ #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/tools.h" #include "flang-rt/runtime/type-info.h" +#include "flang-rt/runtime/work-queue.h" namespace Fortran::runtime { @@ -102,11 +103,7 @@ static RT_API_ATTRS int AllocateAssignmentLHS( toDim.SetByteStride(stride); stride *= toDim.Extent(); } - int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))}; - if (result == StatOk && derived && !derived->noInitializationNeeded()) { - result = ReturnError(terminator, Initialize(to, *derived, terminator)); - } - return result; + return ReturnError(terminator, to.Allocate(kNoAsyncObject)); } // least <= 0, most >= 0 @@ -231,6 +228,8 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to, } } +RT_OFFLOAD_API_GROUP_BEGIN + // Common implementation of assignments, both intrinsic 
assignments and // those cases of polymorphic user-defined ASSIGNMENT(=) TBPs that could not // be resolved in semantics. Most assignment statements do not need any @@ -244,275 +243,453 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to, // dealing with array constructors. RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from, Terminator &terminator, int flags, MemmoveFct memmoveFct) { - bool mustDeallocateLHS{(flags & DeallocateLHS) || - MustDeallocateLHS(to, from, terminator, flags)}; - DescriptorAddendum *toAddendum{to.Addendum()}; - const typeInfo::DerivedType *toDerived{ - toAddendum ? toAddendum->derivedType() : nullptr}; - if (toDerived && (flags & NeedFinalization) && - toDerived->noFinalizationNeeded()) { - flags &= ~NeedFinalization; - } - std::size_t toElementBytes{to.ElementBytes()}; - std::size_t fromElementBytes{from.ElementBytes()}; - // The following lambda definition violates the conding style, - // but cuda-11.8 nvcc hits an internal error with the brace initialization. - auto isSimpleMemmove = [&]() { - return !toDerived && to.rank() == from.rank() && to.IsContiguous() && - from.IsContiguous() && toElementBytes == fromElementBytes; - }; - StaticDescriptor deferredDeallocStatDesc; - Descriptor *deferDeallocation{nullptr}; - if (MayAlias(to, from)) { + WorkQueue workQueue{terminator}; + if (workQueue.BeginAssign(to, from, flags, memmoveFct) == StatContinue) { + workQueue.Run(); + } +} + +RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) { + bool mustDeallocateLHS{(flags_ & DeallocateLHS) || + MustDeallocateLHS(to_, *from_, workQueue.terminator(), flags_)}; + DescriptorAddendum *toAddendum{to_.Addendum()}; + toDerived_ = toAddendum ? 
toAddendum->derivedType() : nullptr; + if (toDerived_ && (flags_ & NeedFinalization) && + toDerived_->noFinalizationNeeded()) { + flags_ &= ~NeedFinalization; + } + if (MayAlias(to_, *from_)) { if (mustDeallocateLHS) { - deferDeallocation = &deferredDeallocStatDesc.descriptor(); + // Convert the LHS into a temporary, then make it look deallocated. + toDeallocate_ = &tempDescriptor_.descriptor(); + persist_ = true; // tempDescriptor_ state must outlive child tickets std::memcpy( - reinterpret_cast(deferDeallocation), &to, to.SizeInBytes()); - to.set_base_addr(nullptr); - } else if (!isSimpleMemmove()) { + reinterpret_cast(toDeallocate_), &to_, to_.SizeInBytes()); + to_.set_base_addr(nullptr); + if (toDerived_ && (flags_ & NeedFinalization)) { + if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)}; + status != StatOk && status != StatContinue) { + return status; + } + flags_ &= ~NeedFinalization; + } + } else if (!IsSimpleMemmove()) { // Handle LHS/RHS aliasing by copying RHS into a temp, then // recursively assigning from that temp. - auto descBytes{from.SizeInBytes()}; - StaticDescriptor staticDesc; - Descriptor &newFrom{staticDesc.descriptor()}; - std::memcpy(reinterpret_cast(&newFrom), &from, descBytes); + auto descBytes{from_->SizeInBytes()}; + Descriptor &newFrom{tempDescriptor_.descriptor()}; + persist_ = true; // tempDescriptor_ state must outlive child tickets + std::memcpy(reinterpret_cast(&newFrom), from_, descBytes); // Pretend the temporary descriptor is for an ALLOCATABLE // entity, otherwise, the Deallocate() below will not // free the descriptor memory. newFrom.raw().attribute = CFI_attribute_allocatable; - auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))}; - if (stat == StatOk) { - if (HasDynamicComponent(from)) { - // If 'from' has allocatable/automatic component, we cannot - // just make a shallow copy of the descriptor member. - // This will still leave data overlap in 'to' and 'newFrom'. 
- // For example: - // type t - // character, allocatable :: c(:) - // end type t - // type(t) :: x(3) - // x(2:3) = x(1:2) - // We have to make a deep copy into 'newFrom' in this case. - RTNAME(AssignTemporary) - (newFrom, from, terminator.sourceFileName(), terminator.sourceLine()); - } else { - ShallowCopy(newFrom, from, true, from.IsContiguous()); + if (int stat{ReturnError( + workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))}; + stat != StatOk) { + return stat; + } + if (HasDynamicComponent(*from_)) { + // If 'from' has allocatable/automatic component, we cannot + // just make a shallow copy of the descriptor member. + // This will still leave data overlap in 'to' and 'newFrom'. + // For example: + // type t + // character, allocatable :: c(:) + // end type t + // type(t) :: x(3) + // x(2:3) = x(1:2) + // We have to make a deep copy into 'newFrom' in this case. + if (const DescriptorAddendum *addendum{newFrom.Addendum()}) { + if (const auto *derived{addendum->derivedType()}) { + if (!derived->noInitializationNeeded()) { + if (int status{workQueue.BeginInitialize(newFrom, *derived)}; + status != StatOk && status != StatContinue) { + return status; + } + } + } + } + static constexpr int nestedFlags{MaybeReallocate | PolymorphicLHS}; + if (int status{workQueue.BeginAssign( + newFrom, *from_, nestedFlags, memmoveFct_)}; + status != StatOk && status != StatContinue) { + return status; } - Assign(to, newFrom, terminator, - flags & - (NeedFinalization | ComponentCanBeDefinedAssignment | - ExplicitLengthCharacterLHS | CanBeDefinedAssignment)); - newFrom.Deallocate(); + } else { + ShallowCopy(newFrom, *from_, true, from_->IsContiguous()); } - return; + from_ = &newFrom; + flags_ &= NeedFinalization | ComponentCanBeDefinedAssignment | + ExplicitLengthCharacterLHS | CanBeDefinedAssignment; + toDeallocate_ = &newFrom; } } - if (to.IsAllocatable()) { + if (to_.IsAllocatable()) { if (mustDeallocateLHS) { - if (deferDeallocation) { - if ((flags & NeedFinalization) && 
toDerived) { - Finalize(*deferDeallocation, *toDerived, &terminator); - flags &= ~NeedFinalization; - } - } else { - to.Destroy((flags & NeedFinalization) != 0, /*destroyPointers=*/false, - &terminator); - flags &= ~NeedFinalization; + if (!toDeallocate_ && to_.IsAllocated()) { + toDeallocate_ = &to_; } - } else if (to.rank() != from.rank() && !to.IsAllocated()) { - terminator.Crash("Assign: mismatched ranks (%d != %d) in assignment to " - "unallocated allocatable", - to.rank(), from.rank()); + } else if (to_.rank() != from_->rank() && !to_.IsAllocated()) { + workQueue.terminator().Crash("Assign: mismatched ranks (%d != %d) in " + "assignment to unallocated allocatable", + to_.rank(), from_->rank()); } - if (!to.IsAllocated()) { - if (AllocateAssignmentLHS(to, from, terminator, flags) != StatOk) { - return; + } else if (!to_.IsAllocated()) { + workQueue.terminator().Crash( + "Assign: left-hand side variable is neither allocated nor allocatable"); + } + if (toDerived_ && to_.IsAllocated()) { + // Schedule finalization or destruction of the LHS. + if (flags_ & NeedFinalization) { + if (int status{workQueue.BeginFinalize(to_, *toDerived_)}; + status != StatOk && status != StatContinue) { + return status; + } + } else if (!toDerived_->noDestructionNeeded()) { + if (int status{ + workQueue.BeginDestroy(to_, *toDerived_, /*finalize=*/false)}; + status != StatOk && status != StatContinue) { + return status; } - flags &= ~NeedFinalization; - toElementBytes = to.ElementBytes(); // may have changed - toDerived = toAddendum ? toAddendum->derivedType() : nullptr; } } - if (toDerived && (flags & CanBeDefinedAssignment)) { - // Check for a user-defined assignment type-bound procedure; - // see 10.2.1.4-5. A user-defined assignment TBP defines all of - // the semantics, including allocatable (re)allocation and any - // finalization. 
- // - // Note that the aliasing and LHS (re)allocation handling above - // needs to run even with CanBeDefinedAssignment flag, when - // the Assign() is invoked recursively for component-per-component - // assignments. - if (to.rank() == 0) { - if (const auto *special{toDerived->FindSpecialBinding( + return StatContinue; +} + +RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) { + if (done_) { + // All child tickets are complete; can release this ticket's state. + if (toDeallocate_) { + toDeallocate_->Deallocate(); + } + return StatOk; + } + // All necessary finalization or destruction that was initiated by Begin() + // has been completed. Deallocation may be pending, and if it's for the LHS, + // do it now so that the LHS gets reallocated. + if (toDeallocate_ == &to_) { + toDeallocate_ = nullptr; + to_.Deallocate(); + } + // Allocate the LHS if needed + if (!to_.IsAllocated()) { + if (int stat{ + AllocateAssignmentLHS(to_, *from_, workQueue.terminator(), flags_)}; + stat != StatOk) { + return stat; + } + const auto *addendum{to_.Addendum()}; + toDerived_ = addendum ? addendum->derivedType() : nullptr; + if (toDerived_ && !toDerived_->noInitializationNeeded()) { + if (int status{workQueue.BeginInitialize(to_, *toDerived_)}; + status != StatOk) { + return status; + } + } + } + // Check for a user-defined assignment type-bound procedure; + // see 10.2.1.4-5. + // Note that the aliasing and LHS (re)allocation handling above + // needs to run even with CanBeDefinedAssignment flag, since + // Assign() can be invoked recursively for component-wise assignments. 
+ if (toDerived_ && (flags_ & CanBeDefinedAssignment)) { + if (to_.rank() == 0) { + if (const auto *special{toDerived_->FindSpecialBinding( typeInfo::SpecialBinding::Which::ScalarAssignment)}) { - return DoScalarDefinedAssignment(to, from, *special); + DoScalarDefinedAssignment(to_, *from_, *special); + done_ = true; + return StatContinue; } } - if (const auto *special{toDerived->FindSpecialBinding( + if (const auto *special{toDerived_->FindSpecialBinding( typeInfo::SpecialBinding::Which::ElementalAssignment)}) { - return DoElementalDefinedAssignment(to, from, *toDerived, *special); + DoElementalDefinedAssignment(to_, *from_, *toDerived_, *special); + done_ = true; + return StatContinue; } } - SubscriptValue toAt[maxRank]; - to.GetLowerBounds(toAt); - // Scalar expansion of the RHS is implied by using the same empty - // subscript values on each (seemingly) elemental reference into - // "from". - SubscriptValue fromAt[maxRank]; - from.GetLowerBounds(fromAt); - std::size_t toElements{to.Elements()}; - if (from.rank() > 0 && toElements != from.Elements()) { - terminator.Crash("Assign: mismatching element counts in array assignment " - "(to %zd, from %zd)", - toElements, from.Elements()); + // Intrinsic assignment + std::size_t toElements{to_.Elements()}; + if (from_->rank() > 0 && toElements != from_->Elements()) { + workQueue.terminator().Crash("Assign: mismatching element counts in array " + "assignment (to %zd, from %zd)", + toElements, from_->Elements()); } - if (to.type() != from.type()) { - terminator.Crash("Assign: mismatching types (to code %d != from code %d)", - to.type().raw(), from.type().raw()); + if (to_.type() != from_->type()) { + workQueue.terminator().Crash( + "Assign: mismatching types (to code %d != from code %d)", + to_.type().raw(), from_->type().raw()); } - if (toElementBytes > fromElementBytes && !to.type().IsCharacter()) { - terminator.Crash("Assign: mismatching non-character element sizes (to %zd " - "bytes != from %zd bytes)", + std::size_t 
toElementBytes{to_.ElementBytes()}; + std::size_t fromElementBytes{from_->ElementBytes()}; + if (toElementBytes > fromElementBytes && !to_.type().IsCharacter()) { + workQueue.terminator().Crash("Assign: mismatching non-character element " + "sizes (to %zd bytes != from %zd bytes)", toElementBytes, fromElementBytes); } - if (const typeInfo::DerivedType * - updatedToDerived{toAddendum ? toAddendum->derivedType() : nullptr}) { - // Derived type intrinsic assignment, which is componentwise and elementwise - // for all components, including parent components (10.2.1.2-3). - // The target is first finalized if still necessary (7.5.6.3(1)) - if (flags & NeedFinalization) { - Finalize(to, *updatedToDerived, &terminator); - } else if (updatedToDerived && !updatedToDerived->noDestructionNeeded()) { - Destroy(to, /*finalize=*/false, *updatedToDerived, &terminator); - } - // Copy the data components (incl. the parent) first. - const Descriptor &componentDesc{updatedToDerived->component()}; - std::size_t numComponents{componentDesc.Elements()}; - for (std::size_t j{0}; j < toElements; - ++j, to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) { - for (std::size_t k{0}; k < numComponents; ++k) { - const auto &comp{ - *componentDesc.ZeroBasedIndexedElement( - k)}; // TODO: exploit contiguity here - // Use PolymorphicLHS for components so that the right things happen - // when the components are polymorphic; when they're not, they're both - // not, and their declared types will match. 
- int nestedFlags{MaybeReallocate | PolymorphicLHS}; - if (flags & ComponentCanBeDefinedAssignment) { - nestedFlags |= - CanBeDefinedAssignment | ComponentCanBeDefinedAssignment; - } - switch (comp.genre()) { - case typeInfo::Component::Genre::Data: - if (comp.category() == TypeCategory::Derived) { - StaticDescriptor statDesc[2]; - Descriptor &toCompDesc{statDesc[0].descriptor()}; - Descriptor &fromCompDesc{statDesc[1].descriptor()}; - comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt); - comp.CreatePointerDescriptor( - fromCompDesc, from, terminator, fromAt); - Assign(toCompDesc, fromCompDesc, terminator, nestedFlags); - } else { // Component has intrinsic type; simply copy raw bytes - std::size_t componentByteSize{comp.SizeInBytes(to)}; - memmoveFct(to.Element(toAt) + comp.offset(), - from.Element(fromAt) + comp.offset(), - componentByteSize); - } - break; - case typeInfo::Component::Genre::Pointer: { - std::size_t componentByteSize{comp.SizeInBytes(to)}; - memmoveFct(to.Element(toAt) + comp.offset(), - from.Element(fromAt) + comp.offset(), - componentByteSize); - } break; - case typeInfo::Component::Genre::Allocatable: - case typeInfo::Component::Genre::Automatic: { - auto *toDesc{reinterpret_cast( - to.Element(toAt) + comp.offset())}; - const auto *fromDesc{reinterpret_cast( - from.Element(fromAt) + comp.offset())}; - // Allocatable components of the LHS are unconditionally - // deallocated before assignment (F'2018 10.2.1.3(13)(1)), - // unlike a "top-level" assignment to a variable, where - // deallocation is optional. - // - // Be careful not to destroy/reallocate the LHS, if there is - // overlap between LHS and RHS (it seems that partial overlap - // is not possible, though). - // Invoke Assign() recursively to deal with potential aliasing. - if (toDesc->IsAllocatable()) { - if (!fromDesc->IsAllocated()) { - // No aliasing. - // - // If to is not allocated, the Destroy() call is a no-op. 
- // This is just a shortcut, because the recursive Assign() - // below would initiate the destruction for to. - // No finalization is required. - toDesc->Destroy( - /*finalize=*/false, /*destroyPointers=*/false, &terminator); - continue; // F'2018 10.2.1.3(13)(2) - } - } - // Force LHS deallocation with DeallocateLHS flag. - // The actual deallocation may be avoided, if the existing - // location can be reoccupied. - Assign(*toDesc, *fromDesc, terminator, nestedFlags | DeallocateLHS); - } break; - } + if (toDerived_) { + if (toDerived_->noDefinedAssignment()) { // componentwise + if (int status{workQueue.BeginDerivedAssign( + to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)}; + status != StatOk && status != StatContinue) { + return status; } - // Copy procedure pointer components - const Descriptor &procPtrDesc{updatedToDerived->procPtr()}; - std::size_t numProcPtrs{procPtrDesc.Elements()}; - for (std::size_t k{0}; k < numProcPtrs; ++k) { - const auto &procPtr{ - *procPtrDesc.ZeroBasedIndexedElement( - k)}; - memmoveFct(to.Element(toAt) + procPtr.offset, - from.Element(fromAt) + procPtr.offset, - sizeof(typeInfo::ProcedurePointer)); + } else { // elementwise + if (int status{workQueue.BeginDerivedAssign( + to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)}; + status != StatOk && status != StatContinue) { + return status; } } - } else { // intrinsic type, intrinsic assignment - if (isSimpleMemmove()) { - memmoveFct(to.raw().base_addr, from.raw().base_addr, - toElements * toElementBytes); - } else if (toElementBytes > fromElementBytes) { // blank padding - switch (to.type().raw()) { + toDeallocate_ = nullptr; + } else if (IsSimpleMemmove()) { + memmoveFct_(to_.raw().base_addr, from_->raw().base_addr, + toElements * toElementBytes); + } else { + // Scalar expansion of the RHS is implied by using the same empty + // subscript values on each (seemingly) elemental reference into + // "from". 
+ SubscriptValue toAt[maxRank]; + to_.GetLowerBounds(toAt); + SubscriptValue fromAt[maxRank]; + from_->GetLowerBounds(fromAt); + if (toElementBytes > fromElementBytes) { // blank padding + switch (to_.type().raw()) { case CFI_type_signed_char: case CFI_type_char: - BlankPadCharacterAssignment(to, from, toAt, fromAt, toElements, + BlankPadCharacterAssignment(to_, *from_, toAt, fromAt, toElements, toElementBytes, fromElementBytes); break; case CFI_type_char16_t: - BlankPadCharacterAssignment(to, from, toAt, fromAt, + BlankPadCharacterAssignment(to_, *from_, toAt, fromAt, toElements, toElementBytes, fromElementBytes); break; case CFI_type_char32_t: - BlankPadCharacterAssignment(to, from, toAt, fromAt, + BlankPadCharacterAssignment(to_, *from_, toAt, fromAt, toElements, toElementBytes, fromElementBytes); break; default: - terminator.Crash("unexpected type code %d in blank padded Assign()", - to.type().raw()); + workQueue.terminator().Crash( + "unexpected type code %d in blank padded Assign()", + to_.type().raw()); } } else { // elemental copies, possibly with character truncation for (std::size_t n{toElements}; n-- > 0; - to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) { - memmoveFct(to.Element(toAt), from.Element(fromAt), + to_.IncrementSubscripts(toAt), from_->IncrementSubscripts(fromAt)) { + memmoveFct_(to_.Element(toAt), from_->Element(fromAt), toElementBytes); } } } - if (deferDeallocation) { - // deferDeallocation is used only when LHS is an allocatable. - // The finalization has already been run for it. 
-    deferDeallocation->Destroy(
-        /*finalize=*/false, /*destroyPointers=*/false, &terminator);
+  if (persist_) {
+    done_ = true;
+    return StatContinue;
+  } else {
+    if (toDeallocate_) {
+      toDeallocate_->Deallocate();
+      toDeallocate_ = nullptr;
+    }
+    return StatOk;
   }
 }

-RT_OFFLOAD_API_GROUP_BEGIN
+// Starts a derived type intrinsic assignment.
+// Returns StatOk when everything was completed here (one bulk memmove, or
+// the type has no data components left to process after the procedure
+// pointers were copied), and StatContinue when Continue() still has to
+// process the data components.
+template
+RT_API_ATTRS int DerivedAssignTicket::Begin(
+    WorkQueue &workQueue) {
+  if (toIsContiguous_ && fromIsContiguous_ &&
+      this->derived_.noDestructionNeeded() &&
+      this->derived_.noDefinedAssignment() &&
+      this->instance_.rank() == this->from_->rank()) {
+    if (std::size_t elementBytes{this->instance_.ElementBytes()};
+        elementBytes == this->from_->ElementBytes()) {
+      // Fastest path.  Both LHS and RHS are contiguous, RHS is not a scalar
+      // to be expanded, the types have the same size, and there are no
+      // allocatable components or defined ASSIGNMENT(=) at any level.
+      memmoveFct_(this->instance_.template OffsetElement(),
+          this->from_->template OffsetElement(),
+          this->instance_.Elements() * elementBytes);
+      return StatOk;
+    }
+  }
+  // Use PolymorphicLHS for components so that the right things happen
+  // when the components are polymorphic; when they're not, they're both
+  // not, and their declared types will match.
+  int nestedFlags{MaybeReallocate | PolymorphicLHS};
+  if (flags_ & ComponentCanBeDefinedAssignment) {
+    nestedFlags |= CanBeDefinedAssignment | ComponentCanBeDefinedAssignment;
+  }
+  // From here on flags_ holds the flags passed to nested component
+  // assignments, not the flags this ticket was created with.
+  flags_ = nestedFlags;
+  // Copy procedure pointer components
+  const Descriptor &procPtrDesc{this->derived_.procPtr()};
+  bool noDataComponents{this->IsComplete()};
+  if (std::size_t numProcPtrs{procPtrDesc.Elements()}) {
+    for (std::size_t k{0}; k < numProcPtrs; ++k) {
+      const auto &procPtr{
+          *procPtrDesc.ZeroBasedIndexedElement(k)};
+      // Loop only over elements.  The element iterator must be rewound
+      // before each pass that does not start from a fresh iterator: the
+      // previous pass (k > 0) always leaves it complete, and with no data
+      // components it may already be complete on entry.  Without the
+      // k > 0 case, only the first procedure pointer component would be
+      // copied whenever the type also has data components.
+      if (noDataComponents || k > 0) {
+        Elementwise::Reset();
+      }
+      for (; !Elementwise::IsComplete(); Elementwise::Advance()) {
+        memmoveFct_(this->instance_.template ElementComponent(
+                        this->subscripts_, procPtr.offset),
+            this->from_->template ElementComponent(
+                this->fromSubscripts_, procPtr.offset),
+            sizeof(typeInfo::ProcedurePointer));
+      }
+    }
+    if (noDataComponents) {
+      return StatOk;
+    }
+    // Rewind the elements so that Continue() starts from the first one.
+    Elementwise::Reset();
+  }
+  if (noDataComponents) {
+    return StatOk;
+  }
+  return StatContinue;
+}
+template RT_API_ATTRS int DerivedAssignTicket::Begin(WorkQueue &);
+template RT_API_ATTRS int DerivedAssignTicket::Begin(WorkQueue &);
+
+template
+RT_API_ATTRS int DerivedAssignTicket::Continue(
+    WorkQueue &workQueue) {
+  while (!this->IsComplete()) {
+    // Copy the data components (incl. the parent) first.
+ switch (this->component_->genre()) { + case typeInfo::Component::Genre::Data: + if (this->component_->category() == TypeCategory::Derived) { + Descriptor &toCompDesc{this->componentDescriptor_.descriptor()}; + Descriptor &fromCompDesc{this->fromComponentDescriptor_.descriptor()}; + this->component_->CreatePointerDescriptor(toCompDesc, this->instance_, + workQueue.terminator(), this->subscripts_); + this->component_->CreatePointerDescriptor(fromCompDesc, *this->from_, + workQueue.terminator(), this->fromSubscripts_); + this->Advance(); + if (int status{workQueue.BeginAssign( + toCompDesc, fromCompDesc, flags_, memmoveFct_)}; + status != StatOk) { + return status; + } + } else { // Component has intrinsic type; simply copy raw bytes + std::size_t componentByteSize{ + this->component_->SizeInBytes(this->instance_)}; + if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) { + std::size_t offset{this->component_->offset()}; + char *to{this->instance_.template OffsetElement(offset)}; + const char *from{ + this->from_->template OffsetElement(offset)}; + std::size_t toElementStride{this->instance_.ElementBytes()}; + std::size_t fromElementStride{ + this->from_->rank() == 0 ? 
0 : this->from_->ElementBytes()}; + if (toElementStride == fromElementStride && + toElementStride == componentByteSize) { + memmoveFct_(to, from, this->elements_ * componentByteSize); + } else { + for (std::size_t n{this->elements_}; n--; + to += toElementStride, from += fromElementStride) { + memmoveFct_(to, from, componentByteSize); + } + } + this->Componentwise::Advance(); + } else { + memmoveFct_( + this->instance_.template Element(this->subscripts_) + + this->component_->offset(), + this->from_->template Element(this->fromSubscripts_) + + this->component_->offset(), + componentByteSize); + this->Advance(); + } + } + break; + case typeInfo::Component::Genre::Pointer: { + std::size_t componentByteSize{ + this->component_->SizeInBytes(this->instance_)}; + if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) { + std::size_t offset{this->component_->offset()}; + char *to{this->instance_.template OffsetElement(offset)}; + const char *from{ + this->from_->template OffsetElement(offset)}; + std::size_t toElementStride{this->instance_.ElementBytes()}; + std::size_t fromElementStride{ + this->from_->rank() == 0 ? 
0 : this->from_->ElementBytes()}; + if (toElementStride == fromElementStride && + toElementStride == componentByteSize) { + memmoveFct_(to, from, this->elements_ * componentByteSize); + } else { + for (std::size_t n{this->elements_}; n--; + to += toElementStride, from += fromElementStride) { + memmoveFct_(to, from, componentByteSize); + } + } + this->Componentwise::Advance(); + } else { + memmoveFct_(this->instance_.template Element(this->subscripts_) + + this->component_->offset(), + this->from_->template Element(this->fromSubscripts_) + + this->component_->offset(), + componentByteSize); + this->Advance(); + } + } break; + case typeInfo::Component::Genre::Allocatable: + case typeInfo::Component::Genre::Automatic: { + auto *toDesc{reinterpret_cast( + this->instance_.template Element(this->subscripts_) + + this->component_->offset())}; + const auto *fromDesc{reinterpret_cast( + this->from_->template Element(this->fromSubscripts_) + + this->component_->offset())}; + if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) { + if (toDesc->IsAllocated()) { + if (this->phase_ == 0) { + this->phase_++; + if (const auto *componentDerived{this->component_->derivedType()}; + componentDerived && !componentDerived->noDestructionNeeded()) { + if (int status{workQueue.BeginDestroy( + *toDesc, *componentDerived, /*finalize=*/false)}; + status != StatOk) { + return status; + } + } + } + toDesc->Deallocate(); + } + this->Advance(); + } else { + // Allocatable components of the LHS are unconditionally + // deallocated before assignment (F'2018 10.2.1.3(13)(1)), + // unlike a "top-level" assignment to a variable, where + // deallocation is optional. + this->Advance(); + int nestedFlags{flags_}; + if (this->derived_.noFinalizationNeeded() && + this->derived_.noInitializationNeeded() && + this->derived_.noDestructionNeeded()) { + // The actual deallocation may be avoided, if the existing + // location can be reoccupied. + } else { + // Force LHS deallocation with DeallocateLHS flag. 
+ nestedFlags |= DeallocateLHS; + } + if (int status{workQueue.BeginAssign( + *toDesc, *fromDesc, nestedFlags, memmoveFct_)}; + status != StatOk) { + return status; + } + } + } break; + } + } + if (deallocateAfter_) { + deallocateAfter_->Deallocate(); + } + return StatOk; +} +template RT_API_ATTRS int DerivedAssignTicket::Continue(WorkQueue &); +template RT_API_ATTRS int DerivedAssignTicket::Continue(WorkQueue &); RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc, const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) { @@ -582,7 +759,6 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from, } } } - Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS); } @@ -599,7 +775,6 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var, void RTDEF(CopyOutAssign)( Descriptor *var, Descriptor &temp, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; - // Copyout from the temporary must not cause any finalizations // for LHS. The variable must be properly initialized already. 
if (var) { diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp index 35037036f63e7..8ab737c701b01 100644 --- a/flang-rt/lib/runtime/derived.cpp +++ b/flang-rt/lib/runtime/derived.cpp @@ -12,6 +12,7 @@ #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/tools.h" #include "flang-rt/runtime/type-info.h" +#include "flang-rt/runtime/work-queue.h" namespace Fortran::runtime { @@ -30,180 +31,193 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank], } RT_API_ATTRS int Initialize(const Descriptor &instance, - const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat, - const Descriptor *errMsg) { - const Descriptor &componentDesc{derived.component()}; - std::size_t elements{instance.Elements()}; - int stat{StatOk}; - // Initialize data components in each element; the per-element iterations - // constitute the inner loops, not the outer ones - std::size_t myComponents{componentDesc.Elements()}; - for (std::size_t k{0}; k < myComponents; ++k) { - const auto &comp{ - *componentDesc.ZeroBasedIndexedElement(k)}; - SubscriptValue at[maxRank]; - instance.GetLowerBounds(at); - if (comp.genre() == typeInfo::Component::Genre::Allocatable || - comp.genre() == typeInfo::Component::Genre::Automatic) { - for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - Descriptor &allocDesc{ - *instance.ElementComponent(at, comp.offset())}; - comp.EstablishDescriptor(allocDesc, instance, terminator); + const typeInfo::DerivedType &derived, Terminator &terminator, bool, + const Descriptor *) { + WorkQueue workQueue{terminator}; + int status{workQueue.BeginInitialize(instance, derived)}; + return status == StatContinue ? 
workQueue.Run() : status; +} + +RT_API_ATTRS int InitializeTicket::Begin(WorkQueue &) { + // Initialize procedure pointer components in each element + const Descriptor &procPtrDesc{derived_.procPtr()}; + if (std::size_t numProcPtrs{procPtrDesc.Elements()}) { + bool noDataComponents{IsComplete()}; + for (std::size_t k{0}; k < numProcPtrs; ++k) { + const auto &comp{ + *procPtrDesc.ZeroBasedIndexedElement(k)}; + // Loop only over elements + if (noDataComponents) { + Elementwise::Reset(); + } + for (; !Elementwise::IsComplete(); Elementwise::Advance()) { + auto &pptr{*instance_.ElementComponent( + subscripts_, comp.offset)}; + pptr = comp.procInitialization; + } + } + if (noDataComponents) { + return StatOk; + } + Elementwise::Reset(); + } + return StatContinue; +} + +RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) { + while (!IsComplete()) { + if (component_->genre() == typeInfo::Component::Genre::Allocatable) { + // Establish allocatable descriptors + for (; !Elementwise::IsComplete(); Elementwise::Advance()) { + Descriptor &allocDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + component_->EstablishDescriptor( + allocDesc, instance_, workQueue.terminator()); allocDesc.raw().attribute = CFI_attribute_allocatable; - if (comp.genre() == typeInfo::Component::Genre::Automatic) { - stat = ReturnError( - terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat); - if (stat == StatOk) { - if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) { - if (const auto *derived{addendum->derivedType()}) { - if (!derived->noInitializationNeeded()) { - stat = Initialize( - allocDesc, *derived, terminator, hasStat, errMsg); - } - } - } - } - if (stat != StatOk) { - break; - } - } } - } else if (const void *init{comp.initialization()}) { + SkipToNextComponent(); + } else if (const void *init{component_->initialization()}) { // Explicit initialization of data pointers and // non-allocatable non-automatic components - std::size_t 
bytes{comp.SizeInBytes(instance)}; - for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - char *ptr{instance.ElementComponent(at, comp.offset())}; + std::size_t bytes{component_->SizeInBytes(instance_)}; + for (; !Elementwise::IsComplete(); Elementwise::Advance()) { + char *ptr{instance_.ElementComponent( + subscripts_, component_->offset())}; std::memcpy(ptr, init, bytes); } - } else if (comp.genre() == typeInfo::Component::Genre::Pointer) { + SkipToNextComponent(); + } else if (component_->genre() == typeInfo::Component::Genre::Pointer) { // Data pointers without explicit initialization are established // so that they are valid right-hand side targets of pointer // assignment statements. - for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - Descriptor &ptrDesc{ - *instance.ElementComponent(at, comp.offset())}; - comp.EstablishDescriptor(ptrDesc, instance, terminator); + for (; !Elementwise::IsComplete(); Elementwise::Advance()) { + Descriptor &ptrDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + component_->EstablishDescriptor( + ptrDesc, instance_, workQueue.terminator()); ptrDesc.raw().attribute = CFI_attribute_pointer; } - } else if (comp.genre() == typeInfo::Component::Genre::Data && - comp.derivedType() && !comp.derivedType()->noInitializationNeeded()) { + SkipToNextComponent(); + } else if (component_->genre() == typeInfo::Component::Genre::Data && + component_->derivedType() && + !component_->derivedType()->noInitializationNeeded()) { // Default initialization of non-pointer non-allocatable/automatic - // data component. Handles parent component's elements. Recursive. + // data component. Handles parent component's elements. 
SubscriptValue extents[maxRank]; - GetComponentExtents(extents, comp, instance); - StaticDescriptor staticDescriptor; - Descriptor &compDesc{staticDescriptor.descriptor()}; - const typeInfo::DerivedType &compType{*comp.derivedType()}; - for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - compDesc.Establish(compType, - instance.ElementComponent(at, comp.offset()), comp.rank(), - extents); - stat = Initialize(compDesc, compType, terminator, hasStat, errMsg); - if (stat != StatOk) { - break; - } + GetComponentExtents(extents, *component_, instance_); + Descriptor &compDesc{componentDescriptor_.descriptor()}; + const typeInfo::DerivedType &compType{*component_->derivedType()}; + compDesc.Establish(compType, + instance_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); + Advance(); + if (int status{workQueue.BeginInitialize(compDesc, compType)}; + status != StatOk) { + return status; } + } else { + SkipToNextComponent(); } } - // Initialize procedure pointer components in each element - const Descriptor &procPtrDesc{derived.procPtr()}; - std::size_t myProcPtrs{procPtrDesc.Elements()}; - for (std::size_t k{0}; k < myProcPtrs; ++k) { - const auto &comp{ - *procPtrDesc.ZeroBasedIndexedElement(k)}; - SubscriptValue at[maxRank]; - instance.GetLowerBounds(at); - for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { - auto &pptr{*instance.ElementComponent( - at, comp.offset)}; - pptr = comp.procInitialization; - } - } - return stat; + return StatOk; } RT_API_ATTRS int InitializeClone(const Descriptor &clone, - const Descriptor &orig, const typeInfo::DerivedType &derived, + const Descriptor &original, const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat, const Descriptor *errMsg) { - const Descriptor &componentDesc{derived.component()}; - std::size_t elements{orig.Elements()}; - int stat{StatOk}; - - // Skip pointers and unallocated variables. 
- if (orig.IsPointer() || !orig.IsAllocated()) { - return stat; + if (original.IsPointer() || !original.IsAllocated()) { + return StatOk; // nothing to do + } else { + WorkQueue workQueue{terminator}; + int status{workQueue.BeginInitializeClone( + clone, original, derived, hasStat, errMsg)}; + return status == StatContinue ? workQueue.Run() : status; } - // Initialize each data component. - std::size_t components{componentDesc.Elements()}; - for (std::size_t i{0}; i < components; ++i) { - const typeInfo::Component &comp{ - *componentDesc.ZeroBasedIndexedElement(i)}; - SubscriptValue at[maxRank]; - orig.GetLowerBounds(at); - // Allocate allocatable components that are also allocated in the original - // object. - if (comp.genre() == typeInfo::Component::Genre::Allocatable) { - // Initialize each element. - for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) { - Descriptor &origDesc{ - *orig.ElementComponent(at, comp.offset())}; - Descriptor &cloneDesc{ - *clone.ElementComponent(at, comp.offset())}; - if (origDesc.IsAllocated()) { +} + +RT_API_ATTRS int InitializeCloneTicket::Continue(WorkQueue &workQueue) { + while (!IsComplete()) { + if (component_->genre() == typeInfo::Component::Genre::Allocatable) { + Descriptor &origDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + if (origDesc.IsAllocated()) { + Descriptor &cloneDesc{*clone_.ElementComponent( + subscripts_, component_->offset())}; + if (phase_ == 0) { + ++phase_; cloneDesc.ApplyMold(origDesc, origDesc.rank()); - stat = ReturnError( - terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat); - if (stat == StatOk) { - if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) { - if (const typeInfo::DerivedType * - derived{addendum->derivedType()}) { - if (!derived->noInitializationNeeded()) { - // Perform default initialization for the allocated element. 
- stat = Initialize( - cloneDesc, *derived, terminator, hasStat, errMsg); - } - // Initialize derived type's allocatables. - if (stat == StatOk) { - stat = InitializeClone(cloneDesc, origDesc, *derived, - terminator, hasStat, errMsg); + if (int stat{ReturnError(workQueue.terminator(), + cloneDesc.Allocate(kNoAsyncObject), errMsg_, hasStat_)}; + stat != StatOk) { + return stat; + } + if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) { + if (const typeInfo::DerivedType *derived{addendum->derivedType()}) { + if (!derived->noInitializationNeeded()) { + // Perform default initialization for the allocated element. + if (int status{workQueue.BeginInitialize(cloneDesc, *derived)}; + status != StatOk) { + return status; } } } } } - if (stat != StatOk) { - break; + if (phase_ == 1) { + ++phase_; + if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) { + if (const typeInfo::DerivedType *derived{addendum->derivedType()}) { + // Initialize derived type's allocatables. + if (int status{workQueue.BeginInitializeClone( + cloneDesc, origDesc, *derived, hasStat_, errMsg_)}; + status != StatOk) { + return status; + } + } + } } } - } else if (comp.genre() == typeInfo::Component::Genre::Data && - comp.derivedType()) { - // Handle nested derived types. - const typeInfo::DerivedType &compType{*comp.derivedType()}; - SubscriptValue extents[maxRank]; - GetComponentExtents(extents, comp, orig); - // Data components don't have descriptors, allocate them. - StaticDescriptor origStaticDesc; - StaticDescriptor cloneStaticDesc; - Descriptor &origDesc{origStaticDesc.descriptor()}; - Descriptor &cloneDesc{cloneStaticDesc.descriptor()}; - // Initialize each element. - for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) { + Advance(); + } else if (component_->genre() == typeInfo::Component::Genre::Data) { + if (component_->derivedType()) { + // Handle nested derived types. 
+ const typeInfo::DerivedType &compType{*component_->derivedType()}; + SubscriptValue extents[maxRank]; + GetComponentExtents(extents, *component_, instance_); + Descriptor &origDesc{componentDescriptor_.descriptor()}; + Descriptor &cloneDesc{cloneComponentDescriptor_.descriptor()}; origDesc.Establish(compType, - orig.ElementComponent(at, comp.offset()), comp.rank(), - extents); + instance_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); cloneDesc.Establish(compType, - clone.ElementComponent(at, comp.offset()), comp.rank(), - extents); - stat = InitializeClone( - cloneDesc, origDesc, compType, terminator, hasStat, errMsg); - if (stat != StatOk) { - break; + clone_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); + Advance(); + if (int status{workQueue.BeginInitializeClone( + cloneDesc, origDesc, compType, hasStat_, errMsg_)}; + status != StatOk) { + return status; } + } else { + SkipToNextComponent(); } + } else { + SkipToNextComponent(); + } + } + return StatOk; +} + +// Fortran 2018 subclause 7.5.6.2 +RT_API_ATTRS void Finalize(const Descriptor &descriptor, + const typeInfo::DerivedType &derived, Terminator *terminator) { + if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) { + Terminator stubTerminator{"Finalize() in Fortran runtime", 0}; + WorkQueue workQueue{terminator ? 
*terminator : stubTerminator}; + if (workQueue.BeginFinalize(descriptor, derived) == StatContinue) { + workQueue.Run(); } } - return stat; } static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal( @@ -221,7 +235,7 @@ static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal( } static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, - const typeInfo::DerivedType &derived, Terminator *terminator) { + const typeInfo::DerivedType &derived, Terminator &terminator) { if (const auto *special{FindFinal(derived, descriptor.rank())}) { if (special->which() == typeInfo::SpecialBinding::Which::ElementalFinal) { std::size_t elements{descriptor.Elements()}; @@ -258,9 +272,7 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, copy = descriptor; copy.set_base_addr(nullptr); copy.raw().attribute = CFI_attribute_allocatable; - Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0}; - RUNTIME_CHECK(terminator ? *terminator : stubTerminator, - copy.Allocate(kNoAsyncObject) == CFI_SUCCESS); + RUNTIME_CHECK(terminator, copy.Allocate(kNoAsyncObject) == CFI_SUCCESS); ShallowCopyDiscontiguousToContiguous(copy, descriptor); argDescriptor = © } @@ -284,87 +296,94 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, } } -// Fortran 2018 subclause 7.5.6.2 -RT_API_ATTRS void Finalize(const Descriptor &descriptor, - const typeInfo::DerivedType &derived, Terminator *terminator) { - if (derived.noFinalizationNeeded() || !descriptor.IsAllocated()) { - return; - } - CallFinalSubroutine(descriptor, derived, terminator); - const auto *parentType{derived.GetParentType()}; - bool recurse{parentType && !parentType->noFinalizationNeeded()}; +RT_API_ATTRS int FinalizeTicket::Begin(WorkQueue &workQueue) { + CallFinalSubroutine(instance_, derived_, workQueue.terminator()); // If there's a finalizable parent component, handle it last, as required // by the Fortran standard (7.5.6.2), and do so recursively with 
the same // descriptor so that the rank is preserved. - const Descriptor &componentDesc{derived.component()}; - std::size_t myComponents{componentDesc.Elements()}; - std::size_t elements{descriptor.Elements()}; - for (auto k{recurse ? std::size_t{1} - /* skip first component, it's the parent */ - : 0}; - k < myComponents; ++k) { - const auto &comp{ - *componentDesc.ZeroBasedIndexedElement(k)}; - SubscriptValue at[maxRank]; - descriptor.GetLowerBounds(at); - if (comp.genre() == typeInfo::Component::Genre::Allocatable && - comp.category() == TypeCategory::Derived) { + finalizableParentType_ = derived_.GetParentType(); + if (finalizableParentType_) { + if (finalizableParentType_->noFinalizationNeeded()) { + finalizableParentType_ = nullptr; + } else { + SkipToNextComponent(); + } + } + return StatContinue; +} + +RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) { + while (!IsComplete()) { + if (component_->genre() == typeInfo::Component::Genre::Allocatable && + component_->category() == TypeCategory::Derived) { // Component may be polymorphic or unlimited polymorphic. Need to use the // dynamic type to check whether finalization is needed. 
- for (std::size_t j{0}; j++ < elements; - descriptor.IncrementSubscripts(at)) { - const Descriptor &compDesc{ - *descriptor.ElementComponent(at, comp.offset())}; - if (compDesc.IsAllocated()) { - if (const DescriptorAddendum * addendum{compDesc.Addendum()}) { - if (const typeInfo::DerivedType * - compDynamicType{addendum->derivedType()}) { - if (!compDynamicType->noFinalizationNeeded()) { - Finalize(compDesc, *compDynamicType, terminator); + const Descriptor &compDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + Advance(); + if (compDesc.IsAllocated()) { + if (const DescriptorAddendum *addendum{compDesc.Addendum()}) { + if (const typeInfo::DerivedType *compDynamicType{ + addendum->derivedType()}) { + if (!compDynamicType->noFinalizationNeeded()) { + if (int status{ + workQueue.BeginFinalize(compDesc, *compDynamicType)}; + status != StatOk) { + return status; } } } } } - } else if (comp.genre() == typeInfo::Component::Genre::Allocatable || - comp.genre() == typeInfo::Component::Genre::Automatic) { - if (const typeInfo::DerivedType * compType{comp.derivedType()}) { - if (!compType->noFinalizationNeeded()) { - for (std::size_t j{0}; j++ < elements; - descriptor.IncrementSubscripts(at)) { - const Descriptor &compDesc{ - *descriptor.ElementComponent(at, comp.offset())}; - if (compDesc.IsAllocated()) { - Finalize(compDesc, *compType, terminator); - } + } else if (component_->genre() == typeInfo::Component::Genre::Allocatable || + component_->genre() == typeInfo::Component::Genre::Automatic) { + if (const typeInfo::DerivedType *compType{component_->derivedType()}; + compType && !compType->noFinalizationNeeded()) { + const Descriptor &compDesc{*instance_.ElementComponent( + subscripts_, component_->offset())}; + Advance(); + if (compDesc.IsAllocated()) { + if (int status{workQueue.BeginFinalize(compDesc, *compType)}; + status != StatOk) { + return status; } } + } else { + SkipToNextComponent(); } - } else if (comp.genre() == 
typeInfo::Component::Genre::Data && - comp.derivedType() && !comp.derivedType()->noFinalizationNeeded()) { + } else if (component_->genre() == typeInfo::Component::Genre::Data && + component_->derivedType() && + !component_->derivedType()->noFinalizationNeeded()) { SubscriptValue extents[maxRank]; - GetComponentExtents(extents, comp, descriptor); - StaticDescriptor staticDescriptor; - Descriptor &compDesc{staticDescriptor.descriptor()}; - const typeInfo::DerivedType &compType{*comp.derivedType()}; - for (std::size_t j{0}; j++ < elements; - descriptor.IncrementSubscripts(at)) { - compDesc.Establish(compType, - descriptor.ElementComponent(at, comp.offset()), comp.rank(), - extents); - Finalize(compDesc, compType, terminator); + GetComponentExtents(extents, *component_, instance_); + Descriptor &compDesc{componentDescriptor_.descriptor()}; + const typeInfo::DerivedType &compType{*component_->derivedType()}; + compDesc.Establish(compType, + instance_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); + Advance(); + if (int status{workQueue.BeginFinalize(compDesc, compType)}; + status != StatOk) { + return status; } + } else { + SkipToNextComponent(); } } - if (recurse) { - StaticDescriptor statDesc; - Descriptor &tmpDesc{statDesc.descriptor()}; - tmpDesc = descriptor; + // Last, do the parent component, if any and finalizable. 
+ if (finalizableParentType_) { + Descriptor &tmpDesc{componentDescriptor_.descriptor()}; + tmpDesc = instance_; tmpDesc.raw().attribute = CFI_attribute_pointer; - tmpDesc.Addendum()->set_derivedType(parentType); - tmpDesc.raw().elem_len = parentType->sizeInBytes(); - Finalize(tmpDesc, *parentType, terminator); + tmpDesc.Addendum()->set_derivedType(finalizableParentType_); + tmpDesc.raw().elem_len = finalizableParentType_->sizeInBytes(); + const auto &parentType{*finalizableParentType_}; + finalizableParentType_ = nullptr; + // Don't return StatOk here if the nested FInalize is still running; + // it needs this->componentDescriptor_. + return workQueue.BeginFinalize(tmpDesc, parentType); } + return StatOk; } // The order of finalization follows Fortran 2018 7.5.6.2, with @@ -373,51 +392,71 @@ RT_API_ATTRS void Finalize(const Descriptor &descriptor, // preceding any deallocation. RT_API_ATTRS void Destroy(const Descriptor &descriptor, bool finalize, const typeInfo::DerivedType &derived, Terminator *terminator) { - if (derived.noDestructionNeeded() || !descriptor.IsAllocated()) { - return; + if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) { + Terminator stubTerminator{"Destroy() in Fortran runtime", 0}; + WorkQueue workQueue{terminator ? *terminator : stubTerminator}; + if (workQueue.BeginDestroy(descriptor, derived, finalize) == StatContinue) { + workQueue.Run(); + } } - if (finalize && !derived.noFinalizationNeeded()) { - Finalize(descriptor, derived, terminator); +} + +RT_API_ATTRS int DestroyTicket::Begin(WorkQueue &workQueue) { + if (finalize_ && !derived_.noFinalizationNeeded()) { + if (int status{workQueue.BeginFinalize(instance_, derived_)}; + status != StatOk && status != StatContinue) { + return status; + } } + return StatContinue; +} + +RT_API_ATTRS int DestroyTicket::Continue(WorkQueue &workQueue) { // Deallocate all direct and indirect allocatable and automatic components. 
// Contrary to finalization, the order of deallocation does not matter. - const Descriptor &componentDesc{derived.component()}; - std::size_t myComponents{componentDesc.Elements()}; - std::size_t elements{descriptor.Elements()}; - SubscriptValue at[maxRank]; - descriptor.GetLowerBounds(at); - for (std::size_t k{0}; k < myComponents; ++k) { - const auto &comp{ - *componentDesc.ZeroBasedIndexedElement(k)}; - const bool destroyComp{ - comp.derivedType() && !comp.derivedType()->noDestructionNeeded()}; - if (comp.genre() == typeInfo::Component::Genre::Allocatable || - comp.genre() == typeInfo::Component::Genre::Automatic) { - for (std::size_t j{0}; j < elements; ++j) { - Descriptor *d{ - descriptor.ElementComponent(at, comp.offset())}; - if (destroyComp) { - Destroy(*d, /*finalize=*/false, *comp.derivedType(), terminator); + while (!IsComplete()) { + const auto *componentDerived{component_->derivedType()}; + if (component_->genre() == typeInfo::Component::Genre::Allocatable || + component_->genre() == typeInfo::Component::Genre::Automatic) { + Descriptor *d{instance_.ElementComponent( + subscripts_, component_->offset())}; + if (d->IsAllocated()) { + if (phase_ == 0) { + ++phase_; + if (componentDerived && !componentDerived->noDestructionNeeded()) { + if (int status{workQueue.BeginDestroy( + *d, *componentDerived, /*finalize=*/false)}; + status != StatOk) { + return status; + } + } } d->Deallocate(); - descriptor.IncrementSubscripts(at); } - } else if (destroyComp && - comp.genre() == typeInfo::Component::Genre::Data) { - SubscriptValue extents[maxRank]; - GetComponentExtents(extents, comp, descriptor); - StaticDescriptor staticDescriptor; - Descriptor &compDesc{staticDescriptor.descriptor()}; - const typeInfo::DerivedType &compType{*comp.derivedType()}; - for (std::size_t j{0}; j++ < elements; - descriptor.IncrementSubscripts(at)) { + Advance(); + } else if (component_->genre() == typeInfo::Component::Genre::Data) { + if (!componentDerived || 
componentDerived->noDestructionNeeded()) { + SkipToNextComponent(); + } else { + SubscriptValue extents[maxRank]; + GetComponentExtents(extents, *component_, instance_); + Descriptor &compDesc{componentDescriptor_.descriptor()}; + const typeInfo::DerivedType &compType{*componentDerived}; compDesc.Establish(compType, - descriptor.ElementComponent(at, comp.offset()), comp.rank(), - extents); - Destroy(compDesc, /*finalize=*/false, *comp.derivedType(), terminator); + instance_.ElementComponent(subscripts_, component_->offset()), + component_->rank(), extents); + Advance(); + if (int status{workQueue.BeginDestroy( + compDesc, *componentDerived, /*finalize=*/false)}; + status != StatOk) { + return status; + } } + } else { + SkipToNextComponent(); } } + return StatOk; } RT_API_ATTRS bool HasDynamicComponent(const Descriptor &descriptor) { diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp index 3db1455af52fe..364724b89ba0d 100644 --- a/flang-rt/lib/runtime/descriptor-io.cpp +++ b/flang-rt/lib/runtime/descriptor-io.cpp @@ -7,15 +7,44 @@ //===----------------------------------------------------------------------===// #include "descriptor-io.h" +#include "edit-input.h" +#include "edit-output.h" +#include "unit.h" +#include "flang-rt/runtime/descriptor.h" +#include "flang-rt/runtime/io-stmt.h" +#include "flang-rt/runtime/namelist.h" +#include "flang-rt/runtime/terminator.h" +#include "flang-rt/runtime/type-info.h" +#include "flang-rt/runtime/work-queue.h" +#include "flang/Common/optional.h" #include "flang/Common/restorer.h" +#include "flang/Common/uint128.h" +#include "flang/Runtime/cpp-type.h" #include "flang/Runtime/freestanding-tools.h" +// Implementation of I/O data list item transfers based on descriptors. +// (All I/O items come through here so that the code is exercised for test; +// some scalar I/O data transfer APIs could be changed to bypass their use +// of descriptors in the future for better efficiency.) 
+ namespace Fortran::runtime::io::descr { RT_OFFLOAD_API_GROUP_BEGIN +template +inline RT_API_ATTRS A &ExtractElement(IoStatementState &io, + const Descriptor &descriptor, const SubscriptValue subscripts[]) { + A *p{descriptor.Element(subscripts)}; + if (!p) { + io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base " + "address or subscripts out of range"); + } + return *p; +} + // Defined formatted I/O (maybe) -Fortran::common::optional DefinedFormattedIo(IoStatementState &io, - const Descriptor &descriptor, const typeInfo::DerivedType &derived, +static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( + IoStatementState &io, const Descriptor &descriptor, + const typeInfo::DerivedType &derived, const typeInfo::SpecialBinding &special, const SubscriptValue subscripts[]) { Fortran::common::optional peek{ @@ -104,8 +133,8 @@ Fortran::common::optional DefinedFormattedIo(IoStatementState &io, } // Defined unformatted I/O -bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor, - const typeInfo::DerivedType &derived, +static RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io, + const Descriptor &descriptor, const typeInfo::DerivedType &derived, const typeInfo::SpecialBinding &special) { // Unformatted I/O must have an external unit (or child thereof). IoErrorHandler &handler{io.GetIoErrorHandler()}; @@ -152,5 +181,619 @@ bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor, return handler.GetIoStat() == IostatOk; } +// Per-category descriptor-based I/O templates + +// TODO (perhaps as a nontrivial but small starter project): implement +// automatic repetition counts, like "10*3.14159", for list-directed and +// NAMELIST array output. 
+ +template +inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io, + const Descriptor &descriptor, [[maybe_unused]] bool isSigned) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + using IntType = CppTypeFor; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + if (auto edit{io.GetNextDataEdit()}) { + IntType &x{ExtractElement(io, descriptor, subscripts)}; + if constexpr (DIR == Direction::Output) { + if (!EditIntegerOutput(io, *edit, x, isSigned)) { + return false; + } + } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { + if (EditIntegerInput( + io, *edit, reinterpret_cast(&x), KIND, isSigned)) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedIntegerIO: subscripts out of bounds"); + } + } else { + return false; + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedRealIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + using RawType = typename RealOutputEditing::BinaryFloatingPoint; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + if (auto edit{io.GetNextDataEdit()}) { + RawType &x{ExtractElement(io, descriptor, subscripts)}; + if constexpr (DIR == Direction::Output) { + if (!RealOutputEditing{io, x}.Edit(*edit)) { + return false; + } + } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { + if (EditRealInput(io, *edit, reinterpret_cast(&x))) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedRealIO: subscripts out of bounds"); + } + } else { + 
return false; + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedComplexIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + bool isListOutput{ + io.get_if>() != nullptr}; + using RawType = typename RealOutputEditing::BinaryFloatingPoint; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + RawType *x{&ExtractElement(io, descriptor, subscripts)}; + if (isListOutput) { + DataEdit rEdit, iEdit; + rEdit.descriptor = DataEdit::ListDirectedRealPart; + iEdit.descriptor = DataEdit::ListDirectedImaginaryPart; + rEdit.modes = iEdit.modes = io.mutableModes(); + if (!RealOutputEditing{io, x[0]}.Edit(rEdit) || + !RealOutputEditing{io, x[1]}.Edit(iEdit)) { + return false; + } + } else { + for (int k{0}; k < 2; ++k, ++x) { + auto edit{io.GetNextDataEdit()}; + if (!edit) { + return false; + } else if constexpr (DIR == Direction::Output) { + if (!RealOutputEditing{io, *x}.Edit(*edit)) { + return false; + } + } else if (edit->descriptor == DataEdit::ListDirectedNullValue) { + break; + } else if (EditRealInput( + io, *edit, reinterpret_cast(x))) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedComplexIO: subscripts out of bounds"); + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedCharacterIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + std::size_t length{descriptor.ElementBytes() / sizeof(A)}; + auto *listOutput{io.get_if>()}; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + A *x{&ExtractElement(io, descriptor, subscripts)}; + if (listOutput) { + if 
(!ListDirectedCharacterOutput(io, *listOutput, x, length)) { + return false; + } + } else if (auto edit{io.GetNextDataEdit()}) { + if constexpr (DIR == Direction::Output) { + if (!EditCharacterOutput(io, *edit, x, length)) { + return false; + } + } else { // input + if (edit->descriptor != DataEdit::ListDirectedNullValue) { + if (EditCharacterInput(io, *edit, x, length)) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + } + } else { + return false; + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedCharacterIO: subscripts out of bounds"); + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedLogicalIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + auto *listOutput{io.get_if>()}; + using IntType = CppTypeFor; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + IntType &x{ExtractElement(io, descriptor, subscripts)}; + if (listOutput) { + if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) { + return false; + } + } else if (auto edit{io.GetNextDataEdit()}) { + if constexpr (DIR == Direction::Output) { + if (!EditLogicalOutput(io, *edit, x != 0)) { + return false; + } + } else { + if (edit->descriptor != DataEdit::ListDirectedNullValue) { + bool truth{}; + if (EditLogicalInput(io, *edit, truth)) { + x = truth; + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + } + } else { + return false; + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedLogicalIO: subscripts out of bounds"); + } + } + return true; +} + +template +RT_API_ATTRS int DerivedIoTicket::Continue(WorkQueue &workQueue) { + while (!IsComplete()) { + if (component_->genre() == typeInfo::Component::Genre::Data) { + // 
Create a descriptor for the component + Descriptor &compDesc{componentDescriptor_.descriptor()}; + component_->CreatePointerDescriptor( + compDesc, instance_, io_.GetIoErrorHandler(), subscripts_); + Advance(); + if (int status{workQueue.BeginDescriptorIo( + io_, compDesc, table_, anyIoTookPlace_)}; + status != StatOk) { + return status; + } + } else { + // Component is itself a descriptor + char *pointer{ + instance_.Element(subscripts_) + component_->offset()}; + const Descriptor &compDesc{ + *reinterpret_cast(pointer)}; + Advance(); + if (compDesc.IsAllocated()) { + if (int status{workQueue.BeginDescriptorIo( + io_, compDesc, table_, anyIoTookPlace_)}; + status != StatOk) { + return status; + } + } + } + } + return StatOk; +} + +template RT_API_ATTRS int DerivedIoTicket::Continue( + WorkQueue &); +template RT_API_ATTRS int DerivedIoTicket::Continue( + WorkQueue &); + +template +RT_API_ATTRS int DescriptorIoTicket::Begin(WorkQueue &workQueue) { + IoErrorHandler &handler{io_.GetIoErrorHandler()}; + if (handler.InError()) { + return handler.GetIoStat(); + } + if (!io_.get_if>()) { + handler.Crash("DescriptorIO() called for wrong I/O direction"); + return handler.GetIoStat(); + } + if constexpr (DIR == Direction::Input) { + if (!io_.BeginReadingRecord()) { + return StatOk; + } + } + if (!io_.get_if>()) { + // Unformatted I/O + IoErrorHandler &handler{io_.GetIoErrorHandler()}; + const DescriptorAddendum *addendum{instance_.Addendum()}; + if (const typeInfo::DerivedType *type{ + addendum ? addendum->derivedType() : nullptr}) { + // derived type unformatted I/O + if (table_) { + if (const auto *definedIo{table_->Find(*type, + DIR == Direction::Input + ? common::DefinedIo::ReadUnformatted + : common::DefinedIo::WriteUnformatted)}) { + if (definedIo->subroutine) { + typeInfo::SpecialBinding special{DIR == Direction::Input + ? 
typeInfo::SpecialBinding::Which::ReadUnformatted + : typeInfo::SpecialBinding::Which::WriteUnformatted, + definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, + false}; + if (DefinedUnformattedIo(io_, instance_, *type, special)) { + anyIoTookPlace_ = true; + return StatOk; + } + } else { + int status{workQueue.BeginDerivedIo( + io_, instance_, *type, table_, anyIoTookPlace_)}; + return status == StatContinue ? StatOk : status; // done here + } + } + } + if (const typeInfo::SpecialBinding *special{ + type->FindSpecialBinding(DIR == Direction::Input + ? typeInfo::SpecialBinding::Which::ReadUnformatted + : typeInfo::SpecialBinding::Which::WriteUnformatted)}) { + if (!table_ || !table_->ignoreNonTbpEntries || special->isTypeBound()) { + // defined derived type unformatted I/O + if (DefinedUnformattedIo(io_, instance_, *type, *special)) { + anyIoTookPlace_ = true; + return StatOk; + } else { + return IostatEnd; + } + } + } + // Default derived type unformatted I/O + // TODO: If no component at any level has defined READ or WRITE + // (as appropriate), the elements are contiguous, and no byte swapping + // is active, do a block transfer via the code below. + int status{workQueue.BeginDerivedIo( + io_, instance_, *type, table_, anyIoTookPlace_)}; + return status == StatContinue ? StatOk : status; // done here + } else { + // intrinsic type unformatted I/O + auto *externalUnf{io_.get_if>()}; + ChildUnformattedIoStatementState *childUnf{nullptr}; + InquireIOLengthState *inq{nullptr}; + bool swapEndianness{false}; + if (externalUnf) { + swapEndianness = externalUnf->unit().swapEndianness(); + } else { + childUnf = io_.get_if>(); + if (!childUnf) { + inq = DIR == Direction::Output ? 
io_.get_if() + : nullptr; + RUNTIME_CHECK(handler, inq != nullptr); + } + } + std::size_t elementBytes{instance_.ElementBytes()}; + std::size_t swappingBytes{elementBytes}; + if (auto maybeCatAndKind{instance_.type().GetCategoryAndKind()}) { + // Byte swapping units can be smaller than elements, namely + // for COMPLEX and CHARACTER. + if (maybeCatAndKind->first == TypeCategory::Character) { + // swap each character position independently + swappingBytes = maybeCatAndKind->second; // kind + } else if (maybeCatAndKind->first == TypeCategory::Complex) { + // swap real and imaginary components independently + swappingBytes /= 2; + } + } + using CharType = + std::conditional_t; + auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool { + if constexpr (DIR == Direction::Output) { + return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes) + : childUnf ? childUnf->Emit(&x, totalBytes, swappingBytes) + : inq->Emit(&x, totalBytes, swappingBytes); + } else { + return externalUnf + ? externalUnf->Receive(&x, totalBytes, swappingBytes) + : childUnf->Receive(&x, totalBytes, swappingBytes); + } + }}; + if (!swapEndianness && + instance_.IsContiguous()) { // contiguous unformatted I/O + char &x{ExtractElement(io_, instance_, subscripts_)}; + if (Transfer(x, elements_ * elementBytes)) { + anyIoTookPlace_ = true; + } else { + return IostatEnd; + } + } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O + for (; !IsComplete(); Advance()) { + char &x{ExtractElement(io_, instance_, subscripts_)}; + if (Transfer(x, elementBytes)) { + anyIoTookPlace_ = true; + } else { + return IostatEnd; + } + } + } + } + // Unformatted I/O never needs to call Continue(). 
+ return StatOk; + } + // Formatted I/O + if (auto catAndKind{instance_.type().GetCategoryAndKind()}) { + TypeCategory cat{catAndKind->first}; + int kind{catAndKind->second}; + bool any{false}; + switch (cat) { + case TypeCategory::Integer: + switch (kind) { + case 1: + any = FormattedIntegerIO<1, DIR>(io_, instance_, true); + break; + case 2: + any = FormattedIntegerIO<2, DIR>(io_, instance_, true); + break; + case 4: + any = FormattedIntegerIO<4, DIR>(io_, instance_, true); + break; + case 8: + any = FormattedIntegerIO<8, DIR>(io_, instance_, true); + break; + case 16: + any = FormattedIntegerIO<16, DIR>(io_, instance_, true); + break; + default: + handler.Crash( + "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Unsigned: + switch (kind) { + case 1: + any = FormattedIntegerIO<1, DIR>(io_, instance_, false); + break; + case 2: + any = FormattedIntegerIO<2, DIR>(io_, instance_, false); + break; + case 4: + any = FormattedIntegerIO<4, DIR>(io_, instance_, false); + break; + case 8: + any = FormattedIntegerIO<8, DIR>(io_, instance_, false); + break; + case 16: + any = FormattedIntegerIO<16, DIR>(io_, instance_, false); + break; + default: + handler.Crash( + "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Real: + switch (kind) { + case 2: + any = FormattedRealIO<2, DIR>(io_, instance_); + break; + case 3: + any = FormattedRealIO<3, DIR>(io_, instance_); + break; + case 4: + any = FormattedRealIO<4, DIR>(io_, instance_); + break; + case 8: + any = FormattedRealIO<8, DIR>(io_, instance_); + break; + case 10: + any = FormattedRealIO<10, DIR>(io_, instance_); + break; + // TODO: case double/double + case 16: + any = FormattedRealIO<16, DIR>(io_, instance_); + break; + default: + handler.Crash( + "not yet implemented: REAL(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Complex: + switch (kind) 
{ + case 2: + any = FormattedComplexIO<2, DIR>(io_, instance_); + break; + case 3: + any = FormattedComplexIO<3, DIR>(io_, instance_); + break; + case 4: + any = FormattedComplexIO<4, DIR>(io_, instance_); + break; + case 8: + any = FormattedComplexIO<8, DIR>(io_, instance_); + break; + case 10: + any = FormattedComplexIO<10, DIR>(io_, instance_); + break; + // TODO: case double/double + case 16: + any = FormattedComplexIO<16, DIR>(io_, instance_); + break; + default: + handler.Crash( + "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Character: + switch (kind) { + case 1: + any = FormattedCharacterIO(io_, instance_); + break; + case 2: + any = FormattedCharacterIO(io_, instance_); + break; + case 4: + any = FormattedCharacterIO(io_, instance_); + break; + default: + handler.Crash( + "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Logical: + switch (kind) { + case 1: + any = FormattedLogicalIO<1, DIR>(io_, instance_); + break; + case 2: + any = FormattedLogicalIO<2, DIR>(io_, instance_); + break; + case 4: + any = FormattedLogicalIO<4, DIR>(io_, instance_); + break; + case 8: + any = FormattedLogicalIO<8, DIR>(io_, instance_); + break; + default: + handler.Crash( + "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind); + return IostatEnd; + } + break; + case TypeCategory::Derived: { + // Derived type information must be present for formatted I/O. + IoErrorHandler &handler{io_.GetIoErrorHandler()}; + const DescriptorAddendum *addendum{instance_.Addendum()}; + RUNTIME_CHECK(handler, addendum != nullptr); + derived_ = addendum->derivedType(); + RUNTIME_CHECK(handler, derived_ != nullptr); + if (table_) { + if (const auto *definedIo{table_->Find(*derived_, + DIR == Direction::Input ? 
common::DefinedIo::ReadFormatted + : common::DefinedIo::WriteFormatted)}) { + if (definedIo->subroutine) { + nonTbpSpecial_.emplace(DIR == Direction::Input + ? typeInfo::SpecialBinding::Which::ReadFormatted + : typeInfo::SpecialBinding::Which::WriteFormatted, + definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, + false); + special_ = &*nonTbpSpecial_; + } + } + } + if (!special_) { + if (const typeInfo::SpecialBinding *binding{ + derived_->FindSpecialBinding(DIR == Direction::Input + ? typeInfo::SpecialBinding::Which::ReadFormatted + : typeInfo::SpecialBinding::Which::WriteFormatted)}) { + if (!table_ || !table_->ignoreNonTbpEntries || + binding->isTypeBound()) { + special_ = binding; + } + } + } + return StatContinue; + } + } + if (any) { + anyIoTookPlace_ = true; + } else { + return IostatEnd; + } + } else { + handler.Crash("DescriptorIO: bad type code (%d) in descriptor", + static_cast(instance_.type().raw())); + return handler.GetIoStat(); + } + return StatOk; +} + +template RT_API_ATTRS int DescriptorIoTicket::Begin( + WorkQueue &); +template RT_API_ATTRS int DescriptorIoTicket::Begin( + WorkQueue &); + +template +RT_API_ATTRS int DescriptorIoTicket::Continue(WorkQueue &workQueue) { + // Only derived type formatted I/O gets here. 
+ while (!IsComplete()) { + if (special_) { + if (auto defined{DefinedFormattedIo( + io_, instance_, *derived_, *special_, subscripts_)}) { + anyIoTookPlace_ |= *defined; + Advance(); + continue; + } + } + Descriptor &elementDesc{elementDescriptor_.descriptor()}; + elementDesc.Establish( + *derived_, nullptr, 0, nullptr, CFI_attribute_pointer); + elementDesc.set_base_addr(instance_.Element(subscripts_)); + Advance(); + if (int status{workQueue.BeginDerivedIo( + io_, elementDesc, *derived_, table_, anyIoTookPlace_)}; + status != StatOk) { + return status; + } + } + return StatOk; +} + +template RT_API_ATTRS int DescriptorIoTicket::Continue( + WorkQueue &); +template RT_API_ATTRS int DescriptorIoTicket::Continue( + WorkQueue &); + +template +RT_API_ATTRS bool DescriptorIO(IoStatementState &io, + const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { + bool anyIoTookPlace{false}; + WorkQueue workQueue{io.GetIoErrorHandler()}; + if (workQueue.BeginDescriptorIo(io, descriptor, table, anyIoTookPlace) == + StatContinue) { + workQueue.Run(); + } + return anyIoTookPlace; +} + +template RT_API_ATTRS bool DescriptorIO( + IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); +template RT_API_ATTRS bool DescriptorIO( + IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); + RT_OFFLOAD_API_GROUP_END } // namespace Fortran::runtime::io::descr diff --git a/flang-rt/lib/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h index eb60f106c9203..88ad59bd24b53 100644 --- a/flang-rt/lib/runtime/descriptor-io.h +++ b/flang-rt/lib/runtime/descriptor-io.h @@ -9,619 +9,27 @@ #ifndef FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_ #define FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_ -// Implementation of I/O data list item transfers based on descriptors. -// (All I/O items come through here so that the code is exercised for test; -// some scalar I/O data transfer APIs could be changed to bypass their use -// of descriptors in the future for better efficiency.) 
+#include "flang-rt/runtime/connection.h" -#include "edit-input.h" -#include "edit-output.h" -#include "unit.h" -#include "flang-rt/runtime/descriptor.h" -#include "flang-rt/runtime/io-stmt.h" -#include "flang-rt/runtime/namelist.h" -#include "flang-rt/runtime/terminator.h" -#include "flang-rt/runtime/type-info.h" -#include "flang/Common/optional.h" -#include "flang/Common/uint128.h" -#include "flang/Runtime/cpp-type.h" +namespace Fortran::runtime { +class Descriptor; +} // namespace Fortran::runtime -namespace Fortran::runtime::io::descr { -template -inline RT_API_ATTRS A &ExtractElement(IoStatementState &io, - const Descriptor &descriptor, const SubscriptValue subscripts[]) { - A *p{descriptor.Element(subscripts)}; - if (!p) { - io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base " - "address or subscripts out of range"); - } - return *p; -} - -// Per-category descriptor-based I/O templates - -// TODO (perhaps as a nontrivial but small starter project): implement -// automatic repetition counts, like "10*3.14159", for list-directed and -// NAMELIST array output. 
- -template -inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io, - const Descriptor &descriptor, [[maybe_unused]] bool isSigned) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - using IntType = CppTypeFor; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - if (auto edit{io.GetNextDataEdit()}) { - IntType &x{ExtractElement(io, descriptor, subscripts)}; - if constexpr (DIR == Direction::Output) { - if (!EditIntegerOutput(io, *edit, x, isSigned)) { - return false; - } - } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditIntegerInput( - io, *edit, reinterpret_cast(&x), KIND, isSigned)) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedIntegerIO: subscripts out of bounds"); - } - } else { - return false; - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedRealIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - using RawType = typename RealOutputEditing::BinaryFloatingPoint; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - if (auto edit{io.GetNextDataEdit()}) { - RawType &x{ExtractElement(io, descriptor, subscripts)}; - if constexpr (DIR == Direction::Output) { - if (!RealOutputEditing{io, x}.Edit(*edit)) { - return false; - } - } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditRealInput(io, *edit, reinterpret_cast(&x))) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedRealIO: subscripts out of bounds"); - } - } else { - 
return false; - } - } - return true; -} +namespace Fortran::runtime::io { +class IoStatementState; +struct NonTbpDefinedIoTable; +} // namespace Fortran::runtime::io -template -inline RT_API_ATTRS bool FormattedComplexIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - bool isListOutput{ - io.get_if>() != nullptr}; - using RawType = typename RealOutputEditing::BinaryFloatingPoint; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - RawType *x{&ExtractElement(io, descriptor, subscripts)}; - if (isListOutput) { - DataEdit rEdit, iEdit; - rEdit.descriptor = DataEdit::ListDirectedRealPart; - iEdit.descriptor = DataEdit::ListDirectedImaginaryPart; - rEdit.modes = iEdit.modes = io.mutableModes(); - if (!RealOutputEditing{io, x[0]}.Edit(rEdit) || - !RealOutputEditing{io, x[1]}.Edit(iEdit)) { - return false; - } - } else { - for (int k{0}; k < 2; ++k, ++x) { - auto edit{io.GetNextDataEdit()}; - if (!edit) { - return false; - } else if constexpr (DIR == Direction::Output) { - if (!RealOutputEditing{io, *x}.Edit(*edit)) { - return false; - } - } else if (edit->descriptor == DataEdit::ListDirectedNullValue) { - break; - } else if (EditRealInput( - io, *edit, reinterpret_cast(x))) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedComplexIO: subscripts out of bounds"); - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedCharacterIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - std::size_t length{descriptor.ElementBytes() / sizeof(A)}; - auto *listOutput{io.get_if>()}; - bool anyInput{false}; - for 
(std::size_t j{0}; j < numElements; ++j) { - A *x{&ExtractElement(io, descriptor, subscripts)}; - if (listOutput) { - if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) { - return false; - } - } else if (auto edit{io.GetNextDataEdit()}) { - if constexpr (DIR == Direction::Output) { - if (!EditCharacterOutput(io, *edit, x, length)) { - return false; - } - } else { // input - if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditCharacterInput(io, *edit, x, length)) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - } - } else { - return false; - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedCharacterIO: subscripts out of bounds"); - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedLogicalIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - auto *listOutput{io.get_if>()}; - using IntType = CppTypeFor; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - IntType &x{ExtractElement(io, descriptor, subscripts)}; - if (listOutput) { - if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) { - return false; - } - } else if (auto edit{io.GetNextDataEdit()}) { - if constexpr (DIR == Direction::Output) { - if (!EditLogicalOutput(io, *edit, x != 0)) { - return false; - } - } else { - if (edit->descriptor != DataEdit::ListDirectedNullValue) { - bool truth{}; - if (EditLogicalInput(io, *edit, truth)) { - x = truth; - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - } - } else { - return false; - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedLogicalIO: subscripts out of bounds"); - } - } - return true; -} +namespace Fortran::runtime::io::descr { template -static 
RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &, +RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable * = nullptr); -// For intrinsic (not defined) derived type I/O, formatted & unformatted -template -static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io, - const typeInfo::Component &component, const Descriptor &origDescriptor, - const SubscriptValue origSubscripts[], Terminator &terminator, - const NonTbpDefinedIoTable *table) { -#if !defined(RT_DEVICE_AVOID_RECURSION) - if (component.genre() == typeInfo::Component::Genre::Data) { - // Create a descriptor for the component - StaticDescriptor statDesc; - Descriptor &desc{statDesc.descriptor()}; - component.CreatePointerDescriptor( - desc, origDescriptor, terminator, origSubscripts); - return DescriptorIO(io, desc, table); - } else { - // Component is itself a descriptor - char *pointer{ - origDescriptor.Element(origSubscripts) + component.offset()}; - const Descriptor &compDesc{*reinterpret_cast(pointer)}; - return compDesc.IsAllocated() && DescriptorIO(io, compDesc, table); - } -#else - terminator.Crash("not yet implemented: component IO"); -#endif -} - -template -static RT_API_ATTRS bool DefaultComponentwiseFormattedIO(IoStatementState &io, - const Descriptor &descriptor, const typeInfo::DerivedType &type, - const NonTbpDefinedIoTable *table, const SubscriptValue subscripts[]) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - const Descriptor &compArray{type.component()}; - RUNTIME_CHECK(handler, compArray.rank() == 1); - std::size_t numComponents{compArray.Elements()}; - SubscriptValue at[maxRank]; - compArray.GetLowerBounds(at); - for (std::size_t k{0}; k < numComponents; - ++k, compArray.IncrementSubscripts(at)) { - const typeInfo::Component &component{ - *compArray.Element(at)}; - if (!DefaultComponentIO( - io, component, descriptor, subscripts, handler, table)) { - // Return true for NAMELIST input if any component appeared. 
- auto *listInput{ - io.get_if>()}; - return DIR == Direction::Input && k > 0 && listInput && - listInput->inNamelistSequence(); - } - } - return true; -} - -template -static RT_API_ATTRS bool DefaultComponentwiseUnformattedIO(IoStatementState &io, - const Descriptor &descriptor, const typeInfo::DerivedType &type, - const NonTbpDefinedIoTable *table) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - const Descriptor &compArray{type.component()}; - RUNTIME_CHECK(handler, compArray.rank() == 1); - std::size_t numComponents{compArray.Elements()}; - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - for (std::size_t j{0}; j < numElements; - ++j, descriptor.IncrementSubscripts(subscripts)) { - SubscriptValue at[maxRank]; - compArray.GetLowerBounds(at); - for (std::size_t k{0}; k < numComponents; - ++k, compArray.IncrementSubscripts(at)) { - const typeInfo::Component &component{ - *compArray.Element(at)}; - if (!DefaultComponentIO( - io, component, descriptor, subscripts, handler, table)) { - return false; - } - } - } - return true; -} - -RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( - IoStatementState &, const Descriptor &, const typeInfo::DerivedType &, - const typeInfo::SpecialBinding &, const SubscriptValue[]); - -template -static RT_API_ATTRS bool FormattedDerivedTypeIO(IoStatementState &io, - const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - // Derived type information must be present for formatted I/O. - const DescriptorAddendum *addendum{descriptor.Addendum()}; - RUNTIME_CHECK(handler, addendum != nullptr); - const typeInfo::DerivedType *type{addendum->derivedType()}; - RUNTIME_CHECK(handler, type != nullptr); - Fortran::common::optional nonTbpSpecial; - const typeInfo::SpecialBinding *special{nullptr}; - if (table) { - if (const auto *definedIo{table->Find(*type, - DIR == Direction::Input ? 
common::DefinedIo::ReadFormatted - : common::DefinedIo::WriteFormatted)}) { - if (definedIo->subroutine) { - nonTbpSpecial.emplace(DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadFormatted - : typeInfo::SpecialBinding::Which::WriteFormatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false); - special = &*nonTbpSpecial; - } - } - } - if (!special) { - if (const typeInfo::SpecialBinding * - binding{type->FindSpecialBinding(DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadFormatted - : typeInfo::SpecialBinding::Which::WriteFormatted)}) { - if (!table || !table->ignoreNonTbpEntries || binding->isTypeBound()) { - special = binding; - } - } - } - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - std::size_t numElements{descriptor.Elements()}; - for (std::size_t j{0}; j < numElements; - ++j, descriptor.IncrementSubscripts(subscripts)) { - Fortran::common::optional result; - if (special) { - result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts); - } - if (!result) { - result = DefaultComponentwiseFormattedIO( - io, descriptor, *type, table, subscripts); - } - if (!result.value()) { - // Return true for NAMELIST input if we got anything. 
- auto *listInput{ - io.get_if>()}; - return DIR == Direction::Input && j > 0 && listInput && - listInput->inNamelistSequence(); - } - } - return true; -} - -RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &, const Descriptor &, - const typeInfo::DerivedType &, const typeInfo::SpecialBinding &); +extern template RT_API_ATTRS bool DescriptorIO( + IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); +extern template RT_API_ATTRS bool DescriptorIO( + IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); -// Unformatted I/O -template -static RT_API_ATTRS bool UnformattedDescriptorIO(IoStatementState &io, - const Descriptor &descriptor, const NonTbpDefinedIoTable *table = nullptr) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - const DescriptorAddendum *addendum{descriptor.Addendum()}; - if (const typeInfo::DerivedType * - type{addendum ? addendum->derivedType() : nullptr}) { - // derived type unformatted I/O - if (table) { - if (const auto *definedIo{table->Find(*type, - DIR == Direction::Input ? common::DefinedIo::ReadUnformatted - : common::DefinedIo::WriteUnformatted)}) { - if (definedIo->subroutine) { - typeInfo::SpecialBinding special{DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadUnformatted - : typeInfo::SpecialBinding::Which::WriteUnformatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false}; - if (Fortran::common::optional wasDefined{ - DefinedUnformattedIo(io, descriptor, *type, special)}) { - return *wasDefined; - } - } else { - return DefaultComponentwiseUnformattedIO( - io, descriptor, *type, table); - } - } - } - if (const typeInfo::SpecialBinding * - special{type->FindSpecialBinding(DIR == Direction::Input - ? 
typeInfo::SpecialBinding::Which::ReadUnformatted - : typeInfo::SpecialBinding::Which::WriteUnformatted)}) { - if (!table || !table->ignoreNonTbpEntries || special->isTypeBound()) { - // defined derived type unformatted I/O - return DefinedUnformattedIo(io, descriptor, *type, *special); - } - } - // Default derived type unformatted I/O - // TODO: If no component at any level has defined READ or WRITE - // (as appropriate), the elements are contiguous, and no byte swapping - // is active, do a block transfer via the code below. - return DefaultComponentwiseUnformattedIO(io, descriptor, *type, table); - } else { - // intrinsic type unformatted I/O - auto *externalUnf{io.get_if>()}; - auto *childUnf{io.get_if>()}; - auto *inq{ - DIR == Direction::Output ? io.get_if() : nullptr}; - RUNTIME_CHECK(handler, externalUnf || childUnf || inq); - std::size_t elementBytes{descriptor.ElementBytes()}; - std::size_t numElements{descriptor.Elements()}; - std::size_t swappingBytes{elementBytes}; - if (auto maybeCatAndKind{descriptor.type().GetCategoryAndKind()}) { - // Byte swapping units can be smaller than elements, namely - // for COMPLEX and CHARACTER. - if (maybeCatAndKind->first == TypeCategory::Character) { - // swap each character position independently - swappingBytes = maybeCatAndKind->second; // kind - } else if (maybeCatAndKind->first == TypeCategory::Complex) { - // swap real and imaginary components independently - swappingBytes /= 2; - } - } - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - using CharType = - std::conditional_t; - auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool { - if constexpr (DIR == Direction::Output) { - return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes) - : childUnf ? childUnf->Emit(&x, totalBytes, swappingBytes) - : inq->Emit(&x, totalBytes, swappingBytes); - } else { - return externalUnf ? 
externalUnf->Receive(&x, totalBytes, swappingBytes) - : childUnf->Receive(&x, totalBytes, swappingBytes); - } - }}; - bool swapEndianness{externalUnf && externalUnf->unit().swapEndianness()}; - if (!swapEndianness && - descriptor.IsContiguous()) { // contiguous unformatted I/O - char &x{ExtractElement(io, descriptor, subscripts)}; - return Transfer(x, numElements * elementBytes); - } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O - for (std::size_t j{0}; j < numElements; ++j) { - char &x{ExtractElement(io, descriptor, subscripts)}; - if (!Transfer(x, elementBytes)) { - return false; - } - if (!descriptor.IncrementSubscripts(subscripts) && - j + 1 < numElements) { - handler.Crash("DescriptorIO: subscripts out of bounds"); - } - } - return true; - } - } -} - -template -static RT_API_ATTRS bool DescriptorIO(IoStatementState &io, - const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { - IoErrorHandler &handler{io.GetIoErrorHandler()}; - if (handler.InError()) { - return false; - } - if (!io.get_if>()) { - handler.Crash("DescriptorIO() called for wrong I/O direction"); - return false; - } - if constexpr (DIR == Direction::Input) { - if (!io.BeginReadingRecord()) { - return false; - } - } - if (!io.get_if>()) { - return UnformattedDescriptorIO(io, descriptor, table); - } - if (auto catAndKind{descriptor.type().GetCategoryAndKind()}) { - TypeCategory cat{catAndKind->first}; - int kind{catAndKind->second}; - switch (cat) { - case TypeCategory::Integer: - switch (kind) { - case 1: - return FormattedIntegerIO<1, DIR>(io, descriptor, true); - case 2: - return FormattedIntegerIO<2, DIR>(io, descriptor, true); - case 4: - return FormattedIntegerIO<4, DIR>(io, descriptor, true); - case 8: - return FormattedIntegerIO<8, DIR>(io, descriptor, true); - case 16: - return FormattedIntegerIO<16, DIR>(io, descriptor, true); - default: - handler.Crash( - "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind); - return false; - } - case 
TypeCategory::Unsigned: - switch (kind) { - case 1: - return FormattedIntegerIO<1, DIR>(io, descriptor, false); - case 2: - return FormattedIntegerIO<2, DIR>(io, descriptor, false); - case 4: - return FormattedIntegerIO<4, DIR>(io, descriptor, false); - case 8: - return FormattedIntegerIO<8, DIR>(io, descriptor, false); - case 16: - return FormattedIntegerIO<16, DIR>(io, descriptor, false); - default: - handler.Crash( - "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Real: - switch (kind) { - case 2: - return FormattedRealIO<2, DIR>(io, descriptor); - case 3: - return FormattedRealIO<3, DIR>(io, descriptor); - case 4: - return FormattedRealIO<4, DIR>(io, descriptor); - case 8: - return FormattedRealIO<8, DIR>(io, descriptor); - case 10: - return FormattedRealIO<10, DIR>(io, descriptor); - // TODO: case double/double - case 16: - return FormattedRealIO<16, DIR>(io, descriptor); - default: - handler.Crash( - "not yet implemented: REAL(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Complex: - switch (kind) { - case 2: - return FormattedComplexIO<2, DIR>(io, descriptor); - case 3: - return FormattedComplexIO<3, DIR>(io, descriptor); - case 4: - return FormattedComplexIO<4, DIR>(io, descriptor); - case 8: - return FormattedComplexIO<8, DIR>(io, descriptor); - case 10: - return FormattedComplexIO<10, DIR>(io, descriptor); - // TODO: case double/double - case 16: - return FormattedComplexIO<16, DIR>(io, descriptor); - default: - handler.Crash( - "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Character: - switch (kind) { - case 1: - return FormattedCharacterIO(io, descriptor); - case 2: - return FormattedCharacterIO(io, descriptor); - case 4: - return FormattedCharacterIO(io, descriptor); - default: - handler.Crash( - "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Logical: 
- switch (kind) { - case 1: - return FormattedLogicalIO<1, DIR>(io, descriptor); - case 2: - return FormattedLogicalIO<2, DIR>(io, descriptor); - case 4: - return FormattedLogicalIO<4, DIR>(io, descriptor); - case 8: - return FormattedLogicalIO<8, DIR>(io, descriptor); - default: - handler.Crash( - "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind); - return false; - } - case TypeCategory::Derived: - return FormattedDerivedTypeIO(io, descriptor, table); - } - } - handler.Crash("DescriptorIO: bad type code (%d) in descriptor", - static_cast(descriptor.type().raw())); - return false; -} } // namespace Fortran::runtime::io::descr #endif // FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_ diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp index 1d5304254ed0e..0f0564403c0e2 100644 --- a/flang-rt/lib/runtime/environment.cpp +++ b/flang-rt/lib/runtime/environment.cpp @@ -143,6 +143,10 @@ void ExecutionEnvironment::Configure(int ac, const char *av[], } } + if (auto *x{std::getenv("FLANG_RT_DEBUG")}) { + internalDebugging = std::strtol(x, nullptr, 10); + } + if (auto *x{std::getenv("ACC_OFFLOAD_STACK_SIZE")}) { char *end; auto n{std::strtoul(x, &end, 10)}; diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp index b0cf2180fc6d4..1bef387a9771f 100644 --- a/flang-rt/lib/runtime/namelist.cpp +++ b/flang-rt/lib/runtime/namelist.cpp @@ -10,6 +10,7 @@ #include "descriptor-io.h" #include "flang-rt/runtime/emit-encoded.h" #include "flang-rt/runtime/io-stmt.h" +#include "flang-rt/runtime/type-info.h" #include "flang/Runtime/io-api.h" #include #include diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp index b08195cd31e05..24d05f369fcbe 100644 --- a/flang-rt/lib/runtime/tools.cpp +++ b/flang-rt/lib/runtime/tools.cpp @@ -205,7 +205,7 @@ RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from, // Doing the recursion upwards instead of downwards puts the more common // 
cases earlier in the if-chain and has a tangible impact on performance. template struct ShallowCopyRankSpecialize { - static bool execute(const Descriptor &to, const Descriptor &from, + static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from, bool toIsContiguous, bool fromIsContiguous) { if (to.rank() == RANK && from.rank() == RANK) { ShallowCopyInner(to, from, toIsContiguous, fromIsContiguous); @@ -217,7 +217,7 @@ template struct ShallowCopyRankSpecialize { }; template struct ShallowCopyRankSpecialize { - static bool execute(const Descriptor &to, const Descriptor &from, + static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from, bool toIsContiguous, bool fromIsContiguous) { return false; } diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp index 82182696d70c6..451213202acef 100644 --- a/flang-rt/lib/runtime/type-info.cpp +++ b/flang-rt/lib/runtime/type-info.cpp @@ -140,11 +140,11 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor, const SubscriptValue *subscripts) const { RUNTIME_CHECK(terminator, genre_ == Genre::Data); EstablishDescriptor(descriptor, container, terminator); + std::size_t offset{offset_}; if (subscripts) { - descriptor.set_base_addr(container.Element(subscripts) + offset_); - } else { - descriptor.set_base_addr(container.OffsetElement() + offset_); + offset += container.SubscriptsToByteOffset(subscripts); } + descriptor.set_base_addr(container.OffsetElement() + offset); descriptor.raw().attribute = CFI_attribute_pointer; } diff --git a/flang-rt/lib/runtime/work-queue.cpp b/flang-rt/lib/runtime/work-queue.cpp new file mode 100644 index 0000000000000..a508ecb637102 --- /dev/null +++ b/flang-rt/lib/runtime/work-queue.cpp @@ -0,0 +1,161 @@ +//===-- lib/runtime/work-queue.cpp ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "flang-rt/runtime/work-queue.h" +#include "flang-rt/runtime/environment.h" +#include "flang-rt/runtime/memory.h" +#include "flang-rt/runtime/type-info.h" +#include "flang/Common/visit.h" + +namespace Fortran::runtime { + +#if !defined(RT_DEVICE_COMPILATION) +// FLANG_RT_DEBUG code is disabled when false. +static constexpr bool enableDebugOutput{false}; +#endif + +RT_OFFLOAD_API_GROUP_BEGIN + +RT_API_ATTRS Componentwise::Componentwise(const typeInfo::DerivedType &derived) + : derived_{derived}, components_{derived_.component().Elements()} { + GetComponent(); +} + +RT_API_ATTRS void Componentwise::GetComponent() { + if (IsComplete()) { + component_ = nullptr; + } else { + const Descriptor &componentDesc{derived_.component()}; + component_ = componentDesc.ZeroBasedIndexedElement( + componentAt_); + } +} + +RT_API_ATTRS int Ticket::Continue(WorkQueue &workQueue) { + if (!begun) { + begun = true; + return common::visit( + [&workQueue]( + auto &specificTicket) { return specificTicket.Begin(workQueue); }, + u); + } else { + return common::visit( + [&workQueue](auto &specificTicket) { + return specificTicket.Continue(workQueue); + }, + u); + } +} + +RT_API_ATTRS WorkQueue::~WorkQueue() { + if (last_) { + if ((last_->next = firstFree_)) { + last_->next->previous = last_; + } + firstFree_ = first_; + first_ = last_ = nullptr; + } + while (firstFree_) { + TicketList *next{firstFree_->next}; + if (!firstFree_->isStatic) { + FreeMemory(firstFree_); + } + firstFree_ = next; + } +} + +RT_API_ATTRS Ticket &WorkQueue::StartTicket() { + if (!firstFree_) { + void *p{AllocateMemoryOrCrash(terminator_, sizeof(TicketList))}; + firstFree_ = new (p) TicketList; + firstFree_->isStatic = false; + } + TicketList *newTicket{firstFree_}; + if ((firstFree_ = newTicket->next)) { 
+ firstFree_->previous = nullptr; + } + TicketList *after{insertAfter_ ? insertAfter_->next : nullptr}; + if ((newTicket->previous = insertAfter_ ? insertAfter_ : last_)) { + newTicket->previous->next = newTicket; + } else { + first_ = newTicket; + } + if ((newTicket->next = after)) { + after->previous = newTicket; + } else { + last_ = newTicket; + } + newTicket->ticket.begun = false; +#if !defined(RT_DEVICE_COMPILATION) + if (enableDebugOutput && + (executionEnvironment.internalDebugging & + ExecutionEnvironment::WorkQueue)) { + std::fprintf(stderr, "WQ: new ticket\n"); + } +#endif + return newTicket->ticket; +} + +RT_API_ATTRS int WorkQueue::Run() { + while (last_) { + TicketList *at{last_}; + insertAfter_ = last_; +#if !defined(RT_DEVICE_COMPILATION) + if (enableDebugOutput && + (executionEnvironment.internalDebugging & + ExecutionEnvironment::WorkQueue)) { + std::fprintf(stderr, "WQ: %zd %s\n", at->ticket.u.index(), + at->ticket.begun ? "Continue" : "Begin"); + } +#endif + int stat{at->ticket.Continue(*this)}; +#if !defined(RT_DEVICE_COMPILATION) + if (enableDebugOutput && + (executionEnvironment.internalDebugging & + ExecutionEnvironment::WorkQueue)) { + std::fprintf(stderr, "WQ: ... 
stat %d\n", stat); + } +#endif + insertAfter_ = nullptr; + if (stat == StatOk) { + if (at->previous) { + at->previous->next = at->next; + } else { + first_ = at->next; + } + if (at->next) { + at->next->previous = at->previous; + } else { + last_ = at->previous; + } + if ((at->next = firstFree_)) { + at->next->previous = at; + } + at->previous = nullptr; + firstFree_ = at; + } else if (stat != StatContinue) { + Stop(); + return stat; + } + } + return StatOk; +} + +RT_API_ATTRS void WorkQueue::Stop() { + if (last_) { + if ((last_->next = firstFree_)) { + last_->next->previous = last_; + } + firstFree_ = first_; + first_ = last_ = nullptr; + } +} + +RT_OFFLOAD_API_GROUP_END + +} // namespace Fortran::runtime diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp index 3833e48be3dd6..6c148b1de6f82 100644 --- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp +++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp @@ -184,7 +184,7 @@ TEST(ExternalIOTests, TestSequentialFixedUnformatted) { io = IONAME(BeginInquireIoLength)(__FILE__, __LINE__); for (int j{1}; j <= 3; ++j) { ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc)) - << "OutputDescriptor() for InquireIoLength"; + << "OutputDescriptor() for InquireIoLength " << j; } ASSERT_EQ(IONAME(GetIoLength)(io), 3 * recl) << "GetIoLength"; ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 78d871c593e1d..871749934810c 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -858,6 +858,16 @@ print *, [(j,j=1,10)] warning since such values may have become defined by the time the nested expression's value is required. +* Intrinsic assignment of arrays is defined elementally, and intrinsic + assignment of derived type components is defined componentwise. + However, when intrinsic assignment takes place for an array of derived + type, the order of the loop nesting is not defined. 
+ Some compilers will loop over the elements, assigning all of the components + of each element before proceeding to the next element. + This compiler loops over all of the components, and assigns all of + the elements for each component before proceeding to the next component. + A program using defined assignment might be able to detect the difference. + ## De Facto Standard Features * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the diff --git a/flang/include/flang/Runtime/assign.h b/flang/include/flang/Runtime/assign.h index bc80997a1bec2..eb1f63184a177 100644 --- a/flang/include/flang/Runtime/assign.h +++ b/flang/include/flang/Runtime/assign.h @@ -38,7 +38,7 @@ enum AssignFlags { ComponentCanBeDefinedAssignment = 1 << 3, ExplicitLengthCharacterLHS = 1 << 4, PolymorphicLHS = 1 << 5, - DeallocateLHS = 1 << 6 + DeallocateLHS = 1 << 6, }; #ifdef RT_DEVICE_COMPILATION diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index 3839bc1d2a215..79f7032aac312 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -182,9 +182,12 @@ const Symbol *HasImpureFinal( const Symbol &, std::optional rank = std::nullopt); // Is this type finalizable or does it contain any polymorphic allocatable // ultimate components? -bool MayRequireFinalization(const DerivedTypeSpec &derived); +bool MayRequireFinalization(const DerivedTypeSpec &); // Does this type have an allocatable direct component? -bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived); +bool HasAllocatableDirectComponent(const DerivedTypeSpec &); +// Does this type have any defined assignment at any level (or any polymorphic +// allocatable)? 
+bool MayHaveDefinedAssignment(const DerivedTypeSpec &); bool IsInBlankCommon(const Symbol &); bool IsAssumedLengthCharacter(const Symbol &); diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index ccc5e37c840a9..2a862e0e2858b 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -661,6 +661,10 @@ const Symbol *RuntimeTableBuilder::DescribeType( AddValue(dtValues, derivedTypeSchema_, "nofinalizationneeded"s, IntExpr<1>( derivedTypeSpec && !MayRequireFinalization(*derivedTypeSpec))); + // Similarly, a flag to enable optimized runtime assignment. + AddValue(dtValues, derivedTypeSchema_, "nodefinedassignment"s, + IntExpr<1>( + derivedTypeSpec && !MayHaveDefinedAssignment(*derivedTypeSpec))); } dtObject.get().set_init(MaybeExpr{ StructureExpr(Structure(derivedTypeSchema_, std::move(dtValues)))}); diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 1d1e3ac044166..3247addc905ba 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -813,6 +813,38 @@ bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) { return std::any_of(directs.begin(), directs.end(), IsAllocatable); } +static bool MayHaveDefinedAssignment( + const DerivedTypeSpec &derived, std::set &checked) { + if (const Scope *scope{derived.GetScope()}; + scope && checked.find(scope) == checked.end()) { + checked.insert(scope); + for (const auto &[_, symbolRef] : *scope) { + if (const auto *generic{symbolRef->detailsIf()}) { + if (generic->kind().IsAssignment()) { + return true; + } + } else if (symbolRef->has() && + !IsPointer(*symbolRef)) { + if (const DeclTypeSpec *type{symbolRef->GetType()}) { + if (type->IsPolymorphic()) { + return true; + } else if (const DerivedTypeSpec *derived{type->AsDerived()}) { + if (MayHaveDefinedAssignment(*derived, checked)) { + return true; + } + } + } + } + } + } + return false; +} + +bool 
MayHaveDefinedAssignment(const DerivedTypeSpec &derived) { + std::set checked; + return MayHaveDefinedAssignment(derived, checked); +} + bool IsAssumedLengthCharacter(const Symbol &symbol) { if (const DeclTypeSpec * type{symbol.GetType()}) { return type->category() == DeclTypeSpec::Character && diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90 index b30a6bf697563..7226b06504d28 100644 --- a/flang/module/__fortran_type_info.f90 +++ b/flang/module/__fortran_type_info.f90 @@ -52,7 +52,8 @@ integer(1) :: noInitializationNeeded ! 1 if no component w/ init integer(1) :: noDestructionNeeded ! 1 if no component w/ dealloc/final integer(1) :: noFinalizationNeeded ! 1 if nothing finalizeable - integer(1) :: __padding0(4) + integer(1) :: noDefinedAssignment ! 1 if no defined ASSIGNMENT(=) + integer(1) :: __padding0(3) end type type :: Binding diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90 index 28f0bf78f33c9..2e05b652822b5 100644 --- a/flang/test/Lower/volatile-openmp.f90 +++ b/flang/test/Lower/volatile-openmp.f90 @@ -23,11 +23,11 @@ ! CHECK: %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref>>}>> ! CHECK: %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref>>}>>) -> !fir.ref>>}>, volatile> ! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEcontainer"} : (!fir.ref>>}>, volatile>) -> (!fir.ref>>}>, volatile>, !fir.ref>>}>, volatile>) -! 
CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> +! CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> ! 
CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1> -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> 
(!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) -! 
CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>> -! CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) -> 
(!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>, !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) +! 
CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, 
!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) +! CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>> +! 
CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) -> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, 
!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) ! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>, volatile>) -> !fir.ref>>, volatile> ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref>>, volatile> ! 
CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box>>, index) -> (index, index, index) diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90 index d228cd2a84ca4..7dc92504aeebf 100644 --- a/flang/test/Semantics/typeinfo01.f90 +++ b/flang/test/Semantics/typeinfo01.f90 @@ -8,7 +8,7 @@ module m01 end type !CHECK: Module scope: m01 !CHECK: .c.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.n,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .n.n, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"n" !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1" !CHECK: DerivedType scope: t1 @@ -23,8 +23,8 @@ module m02 end type !CHECK: .c.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:1_8 
init:[component::component(name=.n.parent,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.parent,lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.cn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=4_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .c.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.pn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) end module module m03 @@ -35,7 +35,7 @@ module m03 type(kpdt(4)) :: x !CHECK: .c.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.a,genre=1_1,category=2_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .dt.kpdt, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.kpdt,uninstantiated=NULL(),kindparameter=.kp.kpdt,lenparameterkind=NULL()) -!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .kp.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(8) shape: 0_8:0_8 init:[INTEGER(8)::4_8] end module @@ -49,7 +49,7 @@ module m04 subroutine s1(x) class(tbps), intent(in) :: x end subroutine -!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): 
ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .v.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=s1,name=.n.b1),binding(proc=s1,name=.n.b2)] end module @@ -61,7 +61,7 @@ module m05 subroutine s1(x) class(t), intent(in) :: x end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .p.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(procptrcomponent) shape: 0_8:0_8 init:[procptrcomponent::procptrcomponent(name=.n.p1,offset=0_8,initialization=s1)] end module @@ -85,8 +85,8 
@@ subroutine s2(x, y) class(t), intent(in) :: y end subroutine !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: 
TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] @@ -113,8 +113,8 @@ subroutine s2(x, y) class(t2), intent(in) :: y end subroutine !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] @@ -132,7 +132,7 @@ impure elemental subroutine s1(x, y) class(t), intent(out) :: x class(t), intent(in) :: y end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] end module @@ -155,7 +155,7 @@ impure elemental subroutine s3(x) subroutine s4(x) type(t), contiguous :: x(:,:,:) end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)] end module @@ -197,7 +197,7 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 
init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)] end module @@ -246,7 +246,7 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] end module @@ -263,7 +263,7 @@ module m11 !CHECK: .c.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:3_8 
init:[component::component(name=.n.allocatable,genre=3_1,category=2_1,kind=4_1,rank=1_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.pointer,genre=2_1,category=2_1,kind=4_1,rank=0_1,offset=48_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=.di.t.pointer),component(name=.n.chauto,genre=4_1,category=4_1,kind=1_1,rank=0_1,offset=72_8,characterlen=value(genre=3_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.automatic,genre=4_1,category=2_1,kind=4_1,rank=1_1,offset=96_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=.b.t.automatic,initialization=NULL())] !CHECK: .di.t.pointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(.dp.t.pointer) init:.dp.t.pointer(pointer=target) !CHECK: .dp.t.pointer (CompilerCreated): DerivedType components: pointer -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .lpk.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::8_1] !CHECK: DerivedType scope: .dp.t.pointer size=24 alignment=8 instantiation of .dp.t.pointer !CHECK: 
pointer, POINTER size=24 offset=0: ObjectEntity type: REAL(4) diff --git a/flang/test/Semantics/typeinfo03.f90 b/flang/test/Semantics/typeinfo03.f90 index f0c0a817da4a4..e2552d0a21d6f 100644 --- a/flang/test/Semantics/typeinfo03.f90 +++ b/flang/test/Semantics/typeinfo03.f90 @@ -6,4 +6,4 @@ module m class(*), pointer :: sp, ap(:) end type end module -!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90 index de8464321a409..94dd2199db35a 100644 --- a/flang/test/Semantics/typeinfo04.f90 +++ b/flang/test/Semantics/typeinfo04.f90 @@ -7,18 +7,18 @@ module m contains final :: final end type -!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) type, abstract :: t1 end type -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) type, abstract :: t2 real, allocatable :: a(:) end type -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) type, abstract :: t3 type(finalizable) :: x end type -!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) contains impure elemental subroutine final(x) type(finalizable), intent(in out) :: x diff --git a/flang/test/Semantics/typeinfo05.f90 b/flang/test/Semantics/typeinfo05.f90 index 2a7f12a153eb8..df1aecf3821de 100644 --- a/flang/test/Semantics/typeinfo05.f90 +++ b/flang/test/Semantics/typeinfo05.f90 @@ -7,10 +7,10 @@ program main type t1 type(t2), pointer :: b end type t1 -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) type :: t2 type(t1) :: a end type t2 -! 
CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) end program main diff --git a/flang/test/Semantics/typeinfo06.f90 b/flang/test/Semantics/typeinfo06.f90 index 2385709a8eb44..22f37b1a4369d 100644 --- a/flang/test/Semantics/typeinfo06.f90 +++ b/flang/test/Semantics/typeinfo06.f90 @@ -7,10 +7,10 @@ program main type t1 type(t2), allocatable :: b end type t1 -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) type :: t2 type(t1) :: a end type t2 -! 
CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) +! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) end program main diff --git a/flang/test/Semantics/typeinfo07.f90 b/flang/test/Semantics/typeinfo07.f90 index e8766d9811db8..ab20d6f601106 100644 --- a/flang/test/Semantics/typeinfo07.f90 +++ b/flang/test/Semantics/typeinfo07.f90 @@ -16,7 +16,7 @@ type(t_container_extension) :: wrapper end type end -! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) -! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) -! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) -! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) +! 
CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) +! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) diff --git a/flang/test/Semantics/typeinfo08.f90 b/flang/test/Semantics/typeinfo08.f90 index 689cf469dee3b..391a66f3d6664 100644 --- a/flang/test/Semantics/typeinfo08.f90 +++ b/flang/test/Semantics/typeinfo08.f90 @@ -13,7 +13,7 @@ module m !CHECK: Module scope: m size=0 alignment=1 sourceRange=113 bytes !CHECK: .c.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t1,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) !CHECK: .lpk.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity 
type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::4_1] !CHECK: .n.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"s" !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1" diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90 index 92efc8f9ea54b..08e0b95abb763 100644 --- a/flang/test/Semantics/typeinfo11.f90 +++ b/flang/test/Semantics/typeinfo11.f90 @@ -14,4 +14,4 @@ type(t2) x end -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90 new file mode 100644 index 0000000000000..6b23b63d28b1d --- /dev/null +++ b/flang/test/Semantics/typeinfo12.f90 @@ -0,0 +1,67 @@ +!RUN: bbc --dump-symbols %s | FileCheck %s +!Check "nodefinedassignment" settings. 
+ +module m01 + + type hasAsst1 + contains + procedure asst1 + generic :: assignment(=) => asst1 + end type +!CHECK: .dt.hasasst1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.hasasst1,name=.n.hasasst1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.hasasst1,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) + + type hasAsst2 ! no defined assignment relevant to the runtime + end type + interface assignment(=) + procedure asst2 + end interface +!CHECK: .dt.hasasst2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.hasasst2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type test1 + type(hasAsst1) c + end type +!CHECK: .dt.test1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) + + type test2 + type(hasAsst2) c + end type +!CHECK: .dt.test2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + 
type test3 + type(hasAsst1), pointer :: p + end type +!CHECK: .dt.test3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test3,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test3,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type test4 + type(hasAsst2), pointer :: p + end type +!CHECK: .dt.test4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test4,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type, extends(hasAsst1) :: test5 + end type +!CHECK: .dt.test5, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.test5,name=.n.test5,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test5,procptr=NULL(),special=.s.test5,specialbitset=4_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) + + type, extends(hasAsst2) :: test6 + end type +!CHECK: .dt.test6, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test6,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test6,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type test7 + type(test7), allocatable :: c + end type +!CHECK: .dt.test7, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: 
TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test7,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test7,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) + + type test8 + class(test8), allocatable :: c + end type +!CHECK: .dt.test8, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test8,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test8,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) + + contains + impure elemental subroutine asst1(left, right) + class(hasAsst1), intent(out) :: left + class(hasAsst1), intent(in) :: right + end + impure elemental subroutine asst2(left, right) + class(hasAsst2), intent(out) :: left + class(hasAsst2), intent(in) :: right + end +end From b994a4c04f38d8cfb13f3dbf3d99146cb778443e Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Jun 2025 14:44:41 -0700 Subject: [PATCH 008/851] [flang][NFC] Clean up code in two new functions (#142037) Two recently-added functions in Semantics/tools.h need some cleaning up to conform to the coding style of the project. One of them should actually be in Parser/tools.{h,cpp}, the other doesn't need to be defined in the header. 
--- flang/include/flang/Parser/tools.h | 3 +++ flang/include/flang/Semantics/tools.h | 26 ++------------------- flang/lib/Lower/OpenACC.cpp | 4 ++-- flang/lib/Lower/OpenMP/OpenMP.cpp | 4 ++-- flang/lib/Parser/tools.cpp | 5 ++++ flang/lib/Semantics/check-omp-structure.cpp | 8 +++---- flang/lib/Semantics/tools.cpp | 14 +++++++++++ 7 files changed, 32 insertions(+), 32 deletions(-) diff --git a/flang/include/flang/Parser/tools.h b/flang/include/flang/Parser/tools.h index f1ead11734fa0..447bccd5d35a6 100644 --- a/flang/include/flang/Parser/tools.h +++ b/flang/include/flang/Parser/tools.h @@ -250,5 +250,8 @@ template std::optional GetLastSource(A &x) { return GetSourceHelper::GetSource(const_cast(x)); } +// Checks whether the assignment statement has a single variable on the RHS. +bool CheckForSingleVariableOnRHS(const AssignmentStmt &); + } // namespace Fortran::parser #endif // FORTRAN_PARSER_TOOLS_H_ diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index 79f7032aac312..51df7c40f5b8b 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -756,29 +756,7 @@ std::string GetCommonBlockObjectName(const Symbol &, bool underscoring); // Check for ambiguous USE associations bool HadUseError(SemanticsContext &, SourceName at, const Symbol *); -/// Checks if the assignment statement has a single variable on the RHS. -inline bool checkForSingleVariableOnRHS( - const Fortran::parser::AssignmentStmt &assignmentStmt) { - const Fortran::parser::Expr &expr{ - std::get(assignmentStmt.t)}; - const Fortran::common::Indirection *designator = - std::get_if>( - &expr.u); - return designator != nullptr; -} - -/// Checks if the symbol on the LHS is present in the RHS expression. 
-inline bool checkForSymbolMatch(const Fortran::semantics::SomeExpr *lhs, - const Fortran::semantics::SomeExpr *rhs) { - auto lhsSyms{Fortran::evaluate::GetSymbolVector(*lhs)}; - const Fortran::semantics::Symbol &lhsSymbol{*lhsSyms.front()}; - for (const Fortran::semantics::Symbol &symbol : - Fortran::evaluate::GetSymbolVector(*rhs)) { - if (lhsSymbol == symbol) { - return true; - } - } - return false; -} +// Checks whether the symbol on the LHS is present in the RHS expression. +bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs); } // namespace Fortran::semantics #endif // FORTRAN_SEMANTICS_TOOLS_H_ diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 02dba22c29c7f..c10e1777614cd 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -653,8 +653,8 @@ void genAtomicCapture(Fortran::lower::AbstractConverter &converter, firOpBuilder.createBlock(&(atomicCaptureOp->getRegion(0))); mlir::Block &block = atomicCaptureOp->getRegion(0).back(); firOpBuilder.setInsertionPointToStart(&block); - if (Fortran::semantics::checkForSingleVariableOnRHS(stmt1)) { - if (Fortran::semantics::checkForSymbolMatch( + if (Fortran::parser::CheckForSingleVariableOnRHS(stmt1)) { + if (Fortran::semantics::CheckForSymbolMatch( Fortran::semantics::GetExpr(stmt2Var), Fortran::semantics::GetExpr(stmt2Expr))) { // Atomic capture construct is of the form [capture-stmt, update-stmt] diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 6892e571e62a3..784749bba5a0c 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3200,8 +3200,8 @@ static void genAtomicCapture(lower::AbstractConverter &converter, firOpBuilder.createBlock(&(atomicCaptureOp->getRegion(0))); mlir::Block &block = atomicCaptureOp->getRegion(0).back(); firOpBuilder.setInsertionPointToStart(&block); - if (semantics::checkForSingleVariableOnRHS(stmt1)) { - if 
(semantics::checkForSymbolMatch(semantics::GetExpr(stmt2Var), + if (parser::CheckForSingleVariableOnRHS(stmt1)) { + if (semantics::CheckForSymbolMatch(semantics::GetExpr(stmt2Var), semantics::GetExpr(stmt2Expr))) { // Atomic capture construct is of the form [capture-stmt, update-stmt] const semantics::SomeExpr &fromExpr = *semantics::GetExpr(stmt1Expr); diff --git a/flang/lib/Parser/tools.cpp b/flang/lib/Parser/tools.cpp index 6e5f1ed2fc66f..264ca520f38b8 100644 --- a/flang/lib/Parser/tools.cpp +++ b/flang/lib/Parser/tools.cpp @@ -174,4 +174,9 @@ const CoindexedNamedObject *GetCoindexedNamedObject( }, allocateObject.u); } + +bool CheckForSingleVariableOnRHS(const AssignmentStmt &assignmentStmt) { + return Unwrap(std::get(assignmentStmt.t)) != nullptr; +} + } // namespace Fortran::parser diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index bdd078c33da92..31fcbb9683202 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -2933,9 +2933,9 @@ void OmpStructureChecker::CheckAtomicCaptureConstruct( const auto *e2 = GetExpr(context_, stmt2Expr); if (e1 && v1 && e2 && v2) { - if (semantics::checkForSingleVariableOnRHS(stmt1)) { + if (parser::CheckForSingleVariableOnRHS(stmt1)) { CheckAtomicCaptureStmt(stmt1); - if (semantics::checkForSymbolMatch(v2, e2)) { + if (CheckForSymbolMatch(v2, e2)) { // ATOMIC CAPTURE construct is of the form [capture-stmt, update-stmt] CheckAtomicUpdateStmt(stmt2); } else { @@ -2947,8 +2947,8 @@ void OmpStructureChecker::CheckAtomicCaptureConstruct( "Captured variable/array element/derived-type component %s expected to be assigned in the second statement of ATOMIC CAPTURE construct"_err_en_US, stmt1Expr.source); } - } else if (semantics::checkForSymbolMatch(v1, e1) && - semantics::checkForSingleVariableOnRHS(stmt2)) { + } else if (CheckForSymbolMatch(v1, e1) && + parser::CheckForSingleVariableOnRHS(stmt2)) { // ATOMIC CAPTURE construct 
is of the form [update-stmt, capture-stmt] CheckAtomicUpdateStmt(stmt1); CheckAtomicCaptureStmt(stmt2); diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index 3247addc905ba..ea5ab2d455b54 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -1788,4 +1788,18 @@ bool HadUseError( } } +bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs) { + if (lhs && rhs) { + if (SymbolVector lhsSymbols{evaluate::GetSymbolVector(*lhs)}; + !lhsSymbols.empty()) { + const Symbol &first{*lhsSymbols.front()}; + for (const Symbol &symbol : evaluate::GetSymbolVector(*rhs)) { + if (first == symbol) { + return true; + } + } + } + } + return false; +} } // namespace Fortran::semantics From 54e72d15bc09e9e6464792711b8c475f92a759e2 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Jun 2025 14:44:59 -0700 Subject: [PATCH 009/851] [flang] Ensure overrides of special procedures (#142465) When a derived type declares a generic procedure binding of interest to the runtime library, such as for ASSIGNMENT(=), it overrides any binding that might have been present for the parent type. Fixes https://github.com/llvm/llvm-project/issues/142414. 
--- flang/lib/Semantics/runtime-type-info.cpp | 4 ++-- flang/test/Semantics/typeinfo13.f90 | 26 +++++++++++++++++++++++ 2 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 flang/test/Semantics/typeinfo13.f90 diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 2a862e0e2858b..4c186f4874152 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -1067,7 +1067,7 @@ RuntimeTableBuilder::DescribeSpecialGenerics(const Scope &dtScope, specials = DescribeSpecialGenerics(*parentScope, thisScope, derivedTypeSpec); } - for (auto pair : dtScope) { + for (const auto &pair : dtScope) { const Symbol &symbol{*pair.second}; if (const auto *generic{symbol.detailsIf()}) { DescribeSpecialGeneric(*generic, specials, thisScope, derivedTypeSpec); @@ -1245,7 +1245,7 @@ void RuntimeTableBuilder::DescribeSpecialProc( AddValue(values, specialSchema_, procCompName, SomeExpr{evaluate::ProcedureDesignator{specific}}); // index might already be present in the case of an override - specials.emplace(*index, + specials.insert_or_assign(*index, evaluate::StructureConstructor{ DEREF(specialSchema_.AsDerived()), std::move(values)}); } diff --git a/flang/test/Semantics/typeinfo13.f90 b/flang/test/Semantics/typeinfo13.f90 new file mode 100644 index 0000000000000..cf4abf9e38181 --- /dev/null +++ b/flang/test/Semantics/typeinfo13.f90 @@ -0,0 +1,26 @@ +!RUN: %flang_fc1 -fdebug-dump-symbols %s | FileCheck %s +!Ensure ASSIGNMENT(=) overrides are applied to the special procedures table. 
+module m + type base + contains + procedure :: baseAssign + generic :: assignment(=) => baseAssign + end type + type, extends(base) :: child + contains + procedure :: override + generic :: assignment(=) => override + end type + contains + impure elemental subroutine baseAssign(to, from) + class(base), intent(out) :: to + type(base), intent(in) :: from + end + impure elemental subroutine override(to, from) + class(child), intent(out) :: to + type(child), intent(in) :: from + end +end + +!CHECK: .s.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=override)] +!CHECK: .v.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=baseassign,name=.n.baseassign),binding(proc=override,name=.n.override)] From 2f9dfdfb35bdb10334b09476a47dc1d93beea96c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 10 Jun 2025 15:11:44 -0700 Subject: [PATCH 010/851] [IR] Simplify scalable vector handling in ShuffleVectorInst::getShuffleMask. NFC (#143596) Combine the scalable vector UndefValue check with the earlier ConstantAggregateZero handling for fixed and scalable vectors. Assert that the rest of the code is only reached for fixed vectors. Use append instead of resize since we know the size is increasing. 
--- llvm/lib/IR/Instructions.cpp | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp index b29969657e7fc..2d89ec1b0a8d3 100644 --- a/llvm/lib/IR/Instructions.cpp +++ b/llvm/lib/IR/Instructions.cpp @@ -1854,23 +1854,18 @@ void ShuffleVectorInst::getShuffleMask(const Constant *Mask, SmallVectorImpl &Result) { ElementCount EC = cast(Mask->getType())->getElementCount(); - if (isa(Mask)) { - Result.resize(EC.getKnownMinValue(), 0); + if (isa(Mask) || isa(Mask)) { + int MaskVal = isa(Mask) ? -1 : 0; + Result.append(EC.getKnownMinValue(), MaskVal); return; } - Result.reserve(EC.getKnownMinValue()); + assert(!EC.isScalable() && + "Scalable vector shuffle mask must be undef or zeroinitializer"); - if (EC.isScalable()) { - assert((isa(Mask) || isa(Mask)) && - "Scalable vector shuffle mask must be undef or zeroinitializer"); - int MaskVal = isa(Mask) ? -1 : 0; - for (unsigned I = 0; I < EC.getKnownMinValue(); ++I) - Result.emplace_back(MaskVal); - return; - } + unsigned NumElts = EC.getFixedValue(); - unsigned NumElts = EC.getKnownMinValue(); + Result.reserve(NumElts); if (auto *CDS = dyn_cast(Mask)) { for (unsigned i = 0; i != NumElts; ++i) From 32649e017eaa609fa556b6d6d74bb73abf37214d Mon Sep 17 00:00:00 2001 From: "S. VenkataKeerthy" <31350914+svkeerthy@users.noreply.github.com> Date: Tue, 10 Jun 2025 15:12:16 -0700 Subject: [PATCH 011/851] [IR2Vec] Exposing Embedding as a data type wrapped around std::vector (#143197) Currently `Embedding` is `std::vector`. This PR makes it a data type wrapped around `std::vector` to overload basic arithmetic operators and expose comparison operations. It _simplifies_ the usage here and in the passes where operations on `Embedding` would be performed.
(Tracking issue - #141817) --- llvm/include/llvm/Analysis/IR2Vec.h | 69 ++++++-- llvm/lib/Analysis/IR2Vec.cpp | 69 +++++--- llvm/unittests/Analysis/IR2VecTest.cpp | 208 ++++++++++++++++++++----- 3 files changed, 274 insertions(+), 72 deletions(-) diff --git a/llvm/include/llvm/Analysis/IR2Vec.h b/llvm/include/llvm/Analysis/IR2Vec.h index 9fd1b0ae8e248..8bf21b0e75d67 100644 --- a/llvm/include/llvm/Analysis/IR2Vec.h +++ b/llvm/include/llvm/Analysis/IR2Vec.h @@ -53,7 +53,63 @@ class raw_ostream; enum class IR2VecKind { Symbolic }; namespace ir2vec { -using Embedding = std::vector; +/// Embedding is a datatype that wraps std::vector. It provides +/// additional functionality for arithmetic and comparison operations. +/// It is meant to be used *like* std::vector but is more restrictive +/// in the sense that it does not allow the user to change the size of the +/// embedding vector. The dimension of the embedding is fixed at the time of +/// construction of Embedding object. But the elements can be modified in-place. 
+struct Embedding { +private: + std::vector Data; + +public: + Embedding() = default; + Embedding(const std::vector &V) : Data(V) {} + Embedding(std::vector &&V) : Data(std::move(V)) {} + Embedding(std::initializer_list IL) : Data(IL) {} + + explicit Embedding(size_t Size) : Data(Size) {} + Embedding(size_t Size, double InitialValue) : Data(Size, InitialValue) {} + + size_t size() const { return Data.size(); } + bool empty() const { return Data.empty(); } + + double &operator[](size_t Itr) { + assert(Itr < Data.size() && "Index out of bounds"); + return Data[Itr]; + } + + const double &operator[](size_t Itr) const { + assert(Itr < Data.size() && "Index out of bounds"); + return Data[Itr]; + } + + using iterator = typename std::vector::iterator; + using const_iterator = typename std::vector::const_iterator; + + iterator begin() { return Data.begin(); } + iterator end() { return Data.end(); } + const_iterator begin() const { return Data.begin(); } + const_iterator end() const { return Data.end(); } + const_iterator cbegin() const { return Data.cbegin(); } + const_iterator cend() const { return Data.cend(); } + + const std::vector &getData() const { return Data; } + + /// Arithmetic operators + Embedding &operator+=(const Embedding &RHS); + Embedding &operator-=(const Embedding &RHS); + + /// Adds Src Embedding scaled by Factor with the called Embedding. + /// Called_Embedding += Src * Factor + Embedding &scaleAndAdd(const Embedding &Src, float Factor); + + /// Returns true if the embedding is approximately equal to the RHS embedding + /// within the specified tolerance. + bool approximatelyEquals(const Embedding &RHS, double Tolerance = 1e-6) const; +}; + using InstEmbeddingsMap = DenseMap; using BBEmbeddingsMap = DenseMap; // FIXME: Current the keys are strings. 
This can be changed to @@ -61,8 +117,8 @@ using BBEmbeddingsMap = DenseMap; using Vocab = std::map; /// Embedder provides the interface to generate embeddings (vector -/// representations) for instructions, basic blocks, and functions. The vector -/// representations are generated using IR2Vec algorithms. +/// representations) for instructions, basic blocks, and functions. The +/// vector representations are generated using IR2Vec algorithms. /// /// The Embedder class is an abstract class and it is intended to be /// subclassed for different IR2Vec algorithms like Symbolic and Flow-aware. @@ -99,13 +155,6 @@ class Embedder { /// zero vector. Embedding lookupVocab(const std::string &Key) const; - /// Adds two vectors: Dst += Src - static void addVectors(Embedding &Dst, const Embedding &Src); - - /// Adds Src vector scaled by Factor to Dst vector: Dst += Src * Factor - static void addScaledVector(Embedding &Dst, const Embedding &Src, - float Factor); - public: virtual ~Embedder() = default; diff --git a/llvm/lib/Analysis/IR2Vec.cpp b/llvm/lib/Analysis/IR2Vec.cpp index 490db5fdcdf99..25ce35d4ace37 100644 --- a/llvm/lib/Analysis/IR2Vec.cpp +++ b/llvm/lib/Analysis/IR2Vec.cpp @@ -55,6 +55,51 @@ static cl::opt ArgWeight("ir2vec-arg-weight", cl::Optional, AnalysisKey IR2VecVocabAnalysis::Key; +namespace llvm::json { +inline bool fromJSON(const llvm::json::Value &E, Embedding &Out, + llvm::json::Path P) { + std::vector TempOut; + if (!llvm::json::fromJSON(E, TempOut, P)) + return false; + Out = Embedding(std::move(TempOut)); + return true; +} +} // namespace llvm::json + +// ==----------------------------------------------------------------------===// +// Embedding +//===----------------------------------------------------------------------===// + +Embedding &Embedding::operator+=(const Embedding &RHS) { + assert(this->size() == RHS.size() && "Vectors must have the same dimension"); + std::transform(this->begin(), this->end(), RHS.begin(), this->begin(), + std::plus()); + 
return *this; +} + +Embedding &Embedding::operator-=(const Embedding &RHS) { + assert(this->size() == RHS.size() && "Vectors must have the same dimension"); + std::transform(this->begin(), this->end(), RHS.begin(), this->begin(), + std::minus()); + return *this; +} + +Embedding &Embedding::scaleAndAdd(const Embedding &Src, float Factor) { + assert(this->size() == Src.size() && "Vectors must have the same dimension"); + for (size_t Itr = 0; Itr < this->size(); ++Itr) + (*this)[Itr] += Src[Itr] * Factor; + return *this; +} + +bool Embedding::approximatelyEquals(const Embedding &RHS, + double Tolerance) const { + assert(this->size() == RHS.size() && "Vectors must have the same dimension"); + for (size_t Itr = 0; Itr < this->size(); ++Itr) + if (std::abs((*this)[Itr] - RHS[Itr]) > Tolerance) + return false; + return true; +} + // ==----------------------------------------------------------------------===// // Embedder and its subclasses //===----------------------------------------------------------------------===// @@ -73,20 +118,6 @@ Embedder::create(IR2VecKind Mode, const Function &F, const Vocab &Vocabulary) { return make_error("Unknown IR2VecKind", errc::invalid_argument); } -void Embedder::addVectors(Embedding &Dst, const Embedding &Src) { - assert(Dst.size() == Src.size() && "Vectors must have the same dimension"); - std::transform(Dst.begin(), Dst.end(), Src.begin(), Dst.begin(), - std::plus()); -} - -void Embedder::addScaledVector(Embedding &Dst, const Embedding &Src, - float Factor) { - assert(Dst.size() == Src.size() && "Vectors must have the same dimension"); - for (size_t i = 0; i < Dst.size(); ++i) { - Dst[i] += Src[i] * Factor; - } -} - // FIXME: Currently lookups are string based. 
Use numeric Keys // for efficiency Embedding Embedder::lookupVocab(const std::string &Key) const { @@ -164,20 +195,20 @@ void SymbolicEmbedder::computeEmbeddings(const BasicBlock &BB) const { Embedding InstVector(Dimension, 0); const auto OpcVec = lookupVocab(I.getOpcodeName()); - addScaledVector(InstVector, OpcVec, OpcWeight); + InstVector.scaleAndAdd(OpcVec, OpcWeight); // FIXME: Currently lookups are string based. Use numeric Keys // for efficiency. const auto Type = I.getType(); const auto TypeVec = getTypeEmbedding(Type); - addScaledVector(InstVector, TypeVec, TypeWeight); + InstVector.scaleAndAdd(TypeVec, TypeWeight); for (const auto &Op : I.operands()) { const auto OperandVec = getOperandEmbedding(Op.get()); - addScaledVector(InstVector, OperandVec, ArgWeight); + InstVector.scaleAndAdd(OperandVec, ArgWeight); } InstVecMap[&I] = InstVector; - addVectors(BBVector, InstVector); + BBVector += InstVector; } BBVecMap[&BB] = BBVector; } @@ -187,7 +218,7 @@ void SymbolicEmbedder::computeEmbeddings() const { return; for (const auto &BB : F) { computeEmbeddings(BB); - addVectors(FuncVector, BBVecMap[&BB]); + FuncVector += BBVecMap[&BB]; } } diff --git a/llvm/unittests/Analysis/IR2VecTest.cpp b/llvm/unittests/Analysis/IR2VecTest.cpp index 9e47b2cd8bedd..053b9f75e7a66 100644 --- a/llvm/unittests/Analysis/IR2VecTest.cpp +++ b/llvm/unittests/Analysis/IR2VecTest.cpp @@ -14,6 +14,7 @@ #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" #include "llvm/Support/Error.h" +#include "llvm/Support/JSON.h" #include "gmock/gmock.h" #include "gtest/gtest.h" @@ -32,89 +33,209 @@ class TestableEmbedder : public Embedder { void computeEmbeddings() const override {} void computeEmbeddings(const BasicBlock &BB) const override {} using Embedder::lookupVocab; - static void addVectors(Embedding &Dst, const Embedding &Src) { - Embedder::addVectors(Dst, Src); +}; + +TEST(EmbeddingTest, ConstructorsAndAccessors) { + // Default constructor + { + Embedding E; + EXPECT_TRUE(E.empty()); + 
EXPECT_EQ(E.size(), 0u); } - static void addScaledVector(Embedding &Dst, const Embedding &Src, - float Factor) { - Embedder::addScaledVector(Dst, Src, Factor); + + // Constructor with const std::vector& + { + std::vector Data = {1.0, 2.0, 3.0}; + Embedding E(Data); + EXPECT_FALSE(E.empty()); + ASSERT_THAT(E, SizeIs(3u)); + EXPECT_THAT(E.getData(), ElementsAre(1.0, 2.0, 3.0)); + EXPECT_EQ(E[0], 1.0); + EXPECT_EQ(E[1], 2.0); + EXPECT_EQ(E[2], 3.0); } -}; -TEST(IR2VecTest, CreateSymbolicEmbedder) { - Vocab V = {{"foo", {1.0, 2.0}}}; + // Constructor with std::vector&& + { + Embedding E(std::vector({4.0, 5.0})); + ASSERT_THAT(E, SizeIs(2u)); + EXPECT_THAT(E.getData(), ElementsAre(4.0, 5.0)); + } - LLVMContext Ctx; - Module M("M", Ctx); - FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false); - Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M); + // Constructor with std::initializer_list + { + Embedding E({6.0, 7.0, 8.0, 9.0}); + ASSERT_THAT(E, SizeIs(4u)); + EXPECT_THAT(E.getData(), ElementsAre(6.0, 7.0, 8.0, 9.0)); + EXPECT_EQ(E[0], 6.0); + E[0] = 6.5; + EXPECT_EQ(E[0], 6.5); + } - auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V); - EXPECT_TRUE(static_cast(Result)); + // Constructor with size_t + { + Embedding E(5); + ASSERT_THAT(E, SizeIs(5u)); + EXPECT_THAT(E.getData(), ElementsAre(0.0, 0.0, 0.0, 0.0, 0.0)); + } - auto *Emb = Result->get(); - EXPECT_NE(Emb, nullptr); -} + // Constructor with size_t and double + { + Embedding E(5, 1.5); + ASSERT_THAT(E, SizeIs(5u)); + EXPECT_THAT(E.getData(), ElementsAre(1.5, 1.5, 1.5, 1.5, 1.5)); + } -TEST(IR2VecTest, CreateInvalidMode) { - Vocab V = {{"foo", {1.0, 2.0}}}; + // Test iterators + { + Embedding E({6.5, 7.0, 8.0, 9.0}); + std::vector VecE; + for (double Val : E) { + VecE.push_back(Val); + } + EXPECT_THAT(VecE, ElementsAre(6.5, 7.0, 8.0, 9.0)); + + const Embedding CE = E; + std::vector VecCE; + for (const double &Val : CE) { + VecCE.push_back(Val); + } + EXPECT_THAT(VecCE, 
ElementsAre(6.5, 7.0, 8.0, 9.0)); + + EXPECT_EQ(*E.begin(), 6.5); + EXPECT_EQ(*(E.end() - 1), 9.0); + EXPECT_EQ(*CE.cbegin(), 6.5); + EXPECT_EQ(*(CE.cend() - 1), 9.0); + } +} - LLVMContext Ctx; - Module M("M", Ctx); - FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false); - Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M); +TEST(EmbeddingTest, AddVectors) { + Embedding E1 = {1.0, 2.0, 3.0}; + Embedding E2 = {0.5, 1.5, -1.0}; - // static_cast an invalid int to IR2VecKind - auto Result = Embedder::create(static_cast(-1), *F, V); - EXPECT_FALSE(static_cast(Result)); + E1 += E2; + EXPECT_THAT(E1, ElementsAre(1.5, 3.5, 2.0)); - std::string ErrMsg; - llvm::handleAllErrors( - Result.takeError(), - [&](const llvm::ErrorInfoBase &EIB) { ErrMsg = EIB.message(); }); - EXPECT_NE(ErrMsg.find("Unknown IR2VecKind"), std::string::npos); + // Check that E2 is unchanged + EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0)); } -TEST(IR2VecTest, AddVectors) { +TEST(EmbeddingTest, SubtractVectors) { Embedding E1 = {1.0, 2.0, 3.0}; Embedding E2 = {0.5, 1.5, -1.0}; - TestableEmbedder::addVectors(E1, E2); - EXPECT_THAT(E1, ElementsAre(1.5, 3.5, 2.0)); + E1 -= E2; + EXPECT_THAT(E1, ElementsAre(0.5, 0.5, 4.0)); // Check that E2 is unchanged EXPECT_THAT(E2, ElementsAre(0.5, 1.5, -1.0)); } -TEST(IR2VecTest, AddScaledVector) { +TEST(EmbeddingTest, AddScaledVector) { Embedding E1 = {1.0, 2.0, 3.0}; Embedding E2 = {2.0, 0.5, -1.0}; - TestableEmbedder::addScaledVector(E1, E2, 0.5f); + E1.scaleAndAdd(E2, 0.5f); EXPECT_THAT(E1, ElementsAre(2.0, 2.25, 2.5)); // Check that E2 is unchanged EXPECT_THAT(E2, ElementsAre(2.0, 0.5, -1.0)); } +TEST(EmbeddingTest, ApproximatelyEqual) { + Embedding E1 = {1.0, 2.0, 3.0}; + Embedding E2 = {1.0000001, 2.0000001, 3.0000001}; + EXPECT_TRUE(E1.approximatelyEquals(E2)); // Diff = 1e-7 + + Embedding E3 = {1.00002, 2.00002, 3.00002}; // Diff = 2e-5 + EXPECT_FALSE(E1.approximatelyEquals(E3)); + EXPECT_TRUE(E1.approximatelyEquals(E3, 3e-5)); 
+ + Embedding E_clearly_within = {1.0000005, 2.0000005, 3.0000005}; // Diff = 5e-7 + EXPECT_TRUE(E1.approximatelyEquals(E_clearly_within)); + + Embedding E_clearly_outside = {1.00001, 2.00001, 3.00001}; // Diff = 1e-5 + EXPECT_FALSE(E1.approximatelyEquals(E_clearly_outside)); + + Embedding E4 = {1.0, 2.0, 3.5}; // Large diff + EXPECT_FALSE(E1.approximatelyEquals(E4, 0.01)); + + Embedding E5 = {1.0, 2.0, 3.0}; + EXPECT_TRUE(E1.approximatelyEquals(E5, 0.0)); + EXPECT_TRUE(E1.approximatelyEquals(E5)); +} + #if GTEST_HAS_DEATH_TEST #ifndef NDEBUG -TEST(IR2VecTest, MismatchedDimensionsAddVectors) { +TEST(EmbeddingTest, AccessOutOfBounds) { + Embedding E = {1.0, 2.0, 3.0}; + EXPECT_DEATH(E[3], "Index out of bounds"); + EXPECT_DEATH(E[-1], "Index out of bounds"); + EXPECT_DEATH(E[4] = 4.0, "Index out of bounds"); +} + +TEST(EmbeddingTest, MismatchedDimensionsAddVectors) { Embedding E1 = {1.0, 2.0}; Embedding E2 = {1.0}; - EXPECT_DEATH(TestableEmbedder::addVectors(E1, E2), - "Vectors must have the same dimension"); + EXPECT_DEATH(E1 += E2, "Vectors must have the same dimension"); +} + +TEST(EmbeddingTest, MismatchedDimensionsSubtractVectors) { + Embedding E1 = {1.0, 2.0}; + Embedding E2 = {1.0}; + EXPECT_DEATH(E1 -= E2, "Vectors must have the same dimension"); } -TEST(IR2VecTest, MismatchedDimensionsAddScaledVector) { +TEST(EmbeddingTest, MismatchedDimensionsAddScaledVector) { Embedding E1 = {1.0, 2.0}; Embedding E2 = {1.0}; - EXPECT_DEATH(TestableEmbedder::addScaledVector(E1, E2, 1.0f), + EXPECT_DEATH(E1.scaleAndAdd(E2, 1.0f), + "Vectors must have the same dimension"); +} + +TEST(EmbeddingTest, MismatchedDimensionsApproximatelyEqual) { + Embedding E1 = {1.0, 2.0}; + Embedding E2 = {1.010}; + EXPECT_DEATH(E1.approximatelyEquals(E2), "Vectors must have the same dimension"); } #endif // NDEBUG #endif // GTEST_HAS_DEATH_TEST +TEST(IR2VecTest, CreateSymbolicEmbedder) { + Vocab V = {{"foo", {1.0, 2.0}}}; + + LLVMContext Ctx; + Module M("M", Ctx); + FunctionType *FTy = 
FunctionType::get(Type::getVoidTy(Ctx), false); + Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M); + + auto Result = Embedder::create(IR2VecKind::Symbolic, *F, V); + EXPECT_TRUE(static_cast(Result)); + + auto *Emb = Result->get(); + EXPECT_NE(Emb, nullptr); +} + +TEST(IR2VecTest, CreateInvalidMode) { + Vocab V = {{"foo", {1.0, 2.0}}}; + + LLVMContext Ctx; + Module M("M", Ctx); + FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false); + Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M); + + // static_cast an invalid int to IR2VecKind + auto Result = Embedder::create(static_cast(-1), *F, V); + EXPECT_FALSE(static_cast(Result)); + + std::string ErrMsg; + llvm::handleAllErrors( + Result.takeError(), + [&](const llvm::ErrorInfoBase &EIB) { ErrMsg = EIB.message(); }); + EXPECT_NE(ErrMsg.find("Unknown IR2VecKind"), std::string::npos); +} + TEST(IR2VecTest, LookupVocab) { Vocab V = {{"foo", {1.0, 2.0}}, {"bar", {3.0, 4.0}}}; LLVMContext Ctx; @@ -136,8 +257,9 @@ TEST(IR2VecTest, ZeroDimensionEmbedding) { Embedding E1; Embedding E2; // Should be no-op, but not crash - TestableEmbedder::addVectors(E1, E2); - TestableEmbedder::addScaledVector(E1, E2, 1.0f); + E1 += E2; + E1 -= E2; + E1.scaleAndAdd(E2, 1.0f); EXPECT_TRUE(E1.empty()); } From 3a2bcd96e22721312c9d340c9122a3988dc1e222 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 10 Jun 2025 15:26:54 -0700 Subject: [PATCH 012/851] [RISCV][TTI] Allow partial reduce with mismatched extends (#143608) This depends on the recently add partial_reduce_sumla node for lowering but at this point, we have all the parts. 
--- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 3 +- .../RISCV/partial-reduce-dot-product.ll | 439 ++++++++++++------ 2 files changed, 296 insertions(+), 146 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index ff822dec232a9..d5ea0c5d52293 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -305,8 +305,7 @@ InstructionCost RISCVTTIImpl::getPartialReductionCost( if (!ST->hasStdExtZvqdotq() || ST->getELen() < 64 || Opcode != Instruction::Add || !BinOp || *BinOp != Instruction::Mul || InputTypeA != InputTypeB || !InputTypeA->isIntegerTy(8) || - OpAExtend != OpBExtend || !AccumType->isIntegerTy(32) || - !VF.isKnownMultipleOf(4)) + !AccumType->isIntegerTy(32) || !VF.isKnownMultipleOf(4)) return InstructionCost::getInvalid(); Type *Tp = VectorType::get(AccumType, VF.divideCoefficientBy(4)); diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll index 847c4ba0bebfc..8c29da02b813c 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/partial-reduce-dot-product.ll @@ -351,79 +351,153 @@ for.exit: ; preds = %for.body define i32 @vqdotsu(ptr %a, ptr %b) #0 { -; CHECK-LABEL: define i32 @vqdotsu( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0:[0-9]+]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: 
[[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD1]] to -; CHECK-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13]] = add [[TMP12]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: +; V-LABEL: define i32 @vqdotsu( +; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; V-NEXT: entry: +; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; V: vector.ph: +; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; 
V-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; V-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; V-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; V-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; V-NEXT: br label [[VECTOR_BODY:%.*]] +; V: vector.body: +; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; V-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; V-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to +; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0 +; V-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP10]], align 1 +; V-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD1]] to +; V-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] +; V-NEXT: [[TMP13]] = add [[TMP12]], [[VEC_PHI]] +; V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; V: middle.block: +; V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) +; V-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; V: scalar.ph: +; +; ZVQDOTQ-LABEL: define i32 @vqdotsu( +; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; ZVQDOTQ-NEXT: entry: +; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; ZVQDOTQ: 
vector.ph: +; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; ZVQDOTQ-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; ZVQDOTQ-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; ZVQDOTQ-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; ZVQDOTQ: vector.body: +; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext [[WIDE_LOAD]] to +; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0 +; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP10]], align 1 +; ZVQDOTQ-NEXT: [[TMP11:%.*]] = sext [[WIDE_LOAD1]] to +; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] +; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) +; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; ZVQDOTQ: middle.block: +; ZVQDOTQ-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32( [[PARTIAL_REDUCE]]) +; ZVQDOTQ-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; ZVQDOTQ: scalar.ph: ; -; FIXED-LABEL: define i32 @vqdotsu( -; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) 
#[[ATTR0:[0-9]+]] { -; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] -; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 -; FIXED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; FIXED-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 -; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 -; FIXED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 -; FIXED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 -; FIXED-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> -; FIXED-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> -; FIXED-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] -; FIXED-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] -; FIXED-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]] -; FIXED-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] -; FIXED: middle.block: -; FIXED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] -; FIXED-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) -; FIXED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; FIXED: scalar.ph: +; FIXED-V-LABEL: define i32 @vqdotsu( +; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; FIXED-V-NEXT: entry: +; FIXED-V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-V: vector.ph: +; FIXED-V-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-V: vector.body: +; FIXED-V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-V-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-V-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 +; FIXED-V-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-V-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; 
FIXED-V-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-V-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-V-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]] +; FIXED-V-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] +; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-V: middle.block: +; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] +; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-V: scalar.ph: +; +; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu( +; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; FIXED-ZVQDOTQ-NEXT: entry: +; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-ZVQDOTQ: vector.ph: +; FIXED-ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-ZVQDOTQ: vector.body: +; FIXED-ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = zext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; 
FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = sext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; FIXED-ZVQDOTQ: middle.block: +; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; FIXED-ZVQDOTQ-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-ZVQDOTQ: scalar.ph: ; entry: br label %for.body @@ -448,79 +522,153 @@ for.exit: ; preds = %for.body } define i32 @vqdotsu2(ptr %a, ptr %b) #0 { -; CHECK-LABEL: define i32 @vqdotsu2( -; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; 
CHECK-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 -; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] -; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] -; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 -; CHECK-NEXT: [[TMP8:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0 -; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP10]], align 1 -; CHECK-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD1]] to -; CHECK-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] -; CHECK-NEXT: [[TMP13]] = add [[TMP12]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] -; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: +; V-LABEL: define i32 
@vqdotsu2( +; V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; V-NEXT: entry: +; V-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; V-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; V-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; V: vector.ph: +; V-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; V-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; V-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; V-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; V-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; V-NEXT: br label [[VECTOR_BODY:%.*]] +; V: vector.body: +; V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; V-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; V-NEXT: [[TMP8:%.*]] = sext [[WIDE_LOAD]] to +; V-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; V-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0 +; V-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP10]], align 1 +; V-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD1]] to +; V-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] +; V-NEXT: [[TMP13]] = add [[TMP12]], [[VEC_PHI]] +; V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; V: middle.block: +; V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP13]]) +; V-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; V-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; V: scalar.ph: +; +; 
ZVQDOTQ-LABEL: define i32 @vqdotsu2( +; ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; ZVQDOTQ-NEXT: entry: +; ZVQDOTQ-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; ZVQDOTQ-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 1024, [[TMP1]] +; ZVQDOTQ-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; ZVQDOTQ: vector.ph: +; ZVQDOTQ-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; ZVQDOTQ-NEXT: [[N_MOD_VF:%.*]] = urem i64 1024, [[TMP3]] +; ZVQDOTQ-NEXT: [[N_VEC:%.*]] = sub i64 1024, [[N_MOD_VF]] +; ZVQDOTQ-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; ZVQDOTQ-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 4 +; ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; ZVQDOTQ: vector.body: +; ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i32 0 +; ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP7]], align 1 +; ZVQDOTQ-NEXT: [[TMP8:%.*]] = sext [[WIDE_LOAD]] to +; ZVQDOTQ-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; ZVQDOTQ-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[TMP9]], i32 0 +; ZVQDOTQ-NEXT: [[WIDE_LOAD1:%.*]] = load , ptr [[TMP10]], align 1 +; ZVQDOTQ-NEXT: [[TMP11:%.*]] = zext [[WIDE_LOAD1]] to +; ZVQDOTQ-NEXT: [[TMP12:%.*]] = mul [[TMP11]], [[TMP8]] +; ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call @llvm.experimental.vector.partial.reduce.add.nxv1i32.nxv4i32( [[VEC_PHI]], [[TMP12]]) +; ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] +; ZVQDOTQ-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; ZVQDOTQ-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], 
!llvm.loop [[LOOP8:![0-9]+]] +; ZVQDOTQ: middle.block: +; ZVQDOTQ-NEXT: [[TMP14:%.*]] = call i32 @llvm.vector.reduce.add.nxv1i32( [[PARTIAL_REDUCE]]) +; ZVQDOTQ-NEXT: [[CMP_N:%.*]] = icmp eq i64 1024, [[N_VEC]] +; ZVQDOTQ-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; ZVQDOTQ: scalar.ph: ; -; FIXED-LABEL: define i32 @vqdotsu2( -; FIXED-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { -; FIXED-NEXT: entry: -; FIXED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] -; FIXED: vector.ph: -; FIXED-NEXT: br label [[VECTOR_BODY:%.*]] -; FIXED: vector.body: -; FIXED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] -; FIXED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 -; FIXED-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 -; FIXED-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 -; FIXED-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 -; FIXED-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> -; FIXED-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> -; FIXED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] -; FIXED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 -; FIXED-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 -; FIXED-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 -; FIXED-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 -; FIXED-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> -; FIXED-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> -; FIXED-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] -; 
FIXED-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] -; FIXED-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]] -; FIXED-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] -; FIXED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; FIXED-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 -; FIXED-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; FIXED: middle.block: -; FIXED-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] -; FIXED-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) -; FIXED-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; FIXED: scalar.ph: +; FIXED-V-LABEL: define i32 @vqdotsu2( +; FIXED-V-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; FIXED-V-NEXT: entry: +; FIXED-V-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-V: vector.ph: +; FIXED-V-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-V: vector.body: +; FIXED-V-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[VEC_PHI1:%.*]] = phi <8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; FIXED-V-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-V-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-V-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-V-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-V-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-V-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 
+; FIXED-V-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-V-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; FIXED-V-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 +; FIXED-V-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-V-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-V-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-V-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-V-NEXT: [[TMP12]] = add <8 x i32> [[TMP10]], [[VEC_PHI]] +; FIXED-V-NEXT: [[TMP13]] = add <8 x i32> [[TMP11]], [[VEC_PHI1]] +; FIXED-V-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-V-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-V-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-V: middle.block: +; FIXED-V-NEXT: [[BIN_RDX:%.*]] = add <8 x i32> [[TMP13]], [[TMP12]] +; FIXED-V-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.v8i32(<8 x i32> [[BIN_RDX]]) +; FIXED-V-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-V: scalar.ph: +; +; FIXED-ZVQDOTQ-LABEL: define i32 @vqdotsu2( +; FIXED-ZVQDOTQ-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { +; FIXED-ZVQDOTQ-NEXT: entry: +; FIXED-ZVQDOTQ-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; FIXED-ZVQDOTQ: vector.ph: +; FIXED-ZVQDOTQ-NEXT: br label [[VECTOR_BODY:%.*]] +; FIXED-ZVQDOTQ: vector.body: +; FIXED-ZVQDOTQ-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[PARTIAL_REDUCE5:%.*]], [[VECTOR_BODY]] ] +; FIXED-ZVQDOTQ-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP1:%.*]] 
= getelementptr i8, ptr [[TMP0]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP0]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i8>, ptr [[TMP1]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD2:%.*]] = load <8 x i8>, ptr [[TMP2]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP3:%.*]] = sext <8 x i8> [[WIDE_LOAD]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[WIDE_LOAD2]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[B]], i64 [[INDEX]] +; FIXED-ZVQDOTQ-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 +; FIXED-ZVQDOTQ-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP5]], i32 8 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD3:%.*]] = load <8 x i8>, ptr [[TMP6]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[WIDE_LOAD4:%.*]] = load <8 x i8>, ptr [[TMP7]], align 1 +; FIXED-ZVQDOTQ-NEXT: [[TMP8:%.*]] = zext <8 x i8> [[WIDE_LOAD3]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP9:%.*]] = zext <8 x i8> [[WIDE_LOAD4]] to <8 x i32> +; FIXED-ZVQDOTQ-NEXT: [[TMP10:%.*]] = mul <8 x i32> [[TMP8]], [[TMP3]] +; FIXED-ZVQDOTQ-NEXT: [[TMP11:%.*]] = mul <8 x i32> [[TMP9]], [[TMP4]] +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI]], <8 x i32> [[TMP10]]) +; FIXED-ZVQDOTQ-NEXT: [[PARTIAL_REDUCE5]] = call <2 x i32> @llvm.experimental.vector.partial.reduce.add.v2i32.v8i32(<2 x i32> [[VEC_PHI1]], <8 x i32> [[TMP11]]) +; FIXED-ZVQDOTQ-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 +; FIXED-ZVQDOTQ-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; FIXED-ZVQDOTQ-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; FIXED-ZVQDOTQ: middle.block: +; FIXED-ZVQDOTQ-NEXT: [[BIN_RDX:%.*]] = add <2 x i32> [[PARTIAL_REDUCE5]], [[PARTIAL_REDUCE]] +; FIXED-ZVQDOTQ-NEXT: [[TMP13:%.*]] = call i32 @llvm.vector.reduce.add.v2i32(<2 x i32> [[BIN_RDX]]) +; FIXED-ZVQDOTQ-NEXT: br i1 true, label 
[[FOR_EXIT:%.*]], label [[SCALAR_PH]] +; FIXED-ZVQDOTQ: scalar.ph: ; entry: br label %for.body @@ -543,3 +691,6 @@ for.body: ; preds = %for.body, %entry for.exit: ; preds = %for.body ret i32 %add } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK: {{.*}} +; FIXED: {{.*}} From c7063380205d8776e281f7a6603119aa8ea28c12 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Tue, 10 Jun 2025 15:34:54 -0700 Subject: [PATCH 013/851] [lldb] Fix `target stop-hook add` help output The help output for `target stop-hook add` references non-existing option `--one-line-command`. The correct option is `--one-liner`: ``` -o ( --one-liner ) Add a command for the stop hook. Can be specified more than once, and commands will be run in the order they appear. ``` This commit fixes the help text. rdar://152730660 --- lldb/source/Commands/CommandObjectTarget.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index 21b21954bbc90..a4ced37649ea0 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -4885,9 +4885,9 @@ class CommandObjectTargetStopHookAdd : public CommandObjectParsed, Command Based stop-hooks: ------------------------- Stop hooks can run a list of lldb commands by providing one or more - --one-line-command options. The commands will get run in the order they are - added. Or you can provide no commands, in which case you will enter a - command editor where you can enter the commands to be run. + --one-liner options. The commands will get run in the order they are added. + Or you can provide no commands, in which case you will enter a command editor + where you can enter the commands to be run. 
Python Based Stop Hooks: ------------------------ From 32d2b6ba4797584743d4764b25af0ae6f6c3d063 Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Tue, 10 Jun 2025 15:58:53 -0700 Subject: [PATCH 014/851] [HWASAN] Disable LSan test on Android (#143625) Android HWASan does not support LSan. --- compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp b/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp index b6e486b291f3a..91acd28a1a5ff 100644 --- a/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp +++ b/compiler-rt/test/hwasan/TestCases/Posix/dlerror.cpp @@ -1,6 +1,9 @@ // Make sure dlerror is not classified as a leak even if we use dynamic TLS. // This is currently not implemented, so this test is XFAIL. +// Android HWAsan does not support LSan. +// UNSUPPORTED: android + // RUN: %clangxx_hwasan -O0 %s -o %t && HWASAN_OPTIONS=detect_leaks=1 %run %t #include From 48122a797710a05b5b8620f6051e9716a8e5a6c3 Mon Sep 17 00:00:00 2001 From: Zhen Wang <37195552+wangzpgi@users.noreply.github.com> Date: Tue, 10 Jun 2025 16:15:12 -0700 Subject: [PATCH 015/851] [flang][cuda] Fix CUDA generic resolution for VALUE arguments in device procedures (#140952) For actual arguments that have VALUE attribute inside device routines, treat them as if they have device attribute. 
--- flang/lib/Semantics/check-call.cpp | 7 +++++++ flang/test/Semantics/cuf21.cuf | 11 +++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp index dfc2ddbacf071..6f2503285013d 100644 --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -1033,6 +1033,13 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, *actualDataAttr == common::CUDADataAttr::Managed)) { actualDataAttr = common::CUDADataAttr::Device; } + // For device procedures, treat actual arguments with VALUE attribute as + // device data + if (!actualDataAttr && actualLastSymbol && IsValue(*actualLastSymbol) && + (*procedure.cudaSubprogramAttrs == + common::CUDASubprogramAttrs::Device)) { + actualDataAttr = common::CUDADataAttr::Device; + } } if (dummyDataAttr == common::CUDADataAttr::Device && (dummyIsAssumedShape || dummyIsAssumedRank) && diff --git a/flang/test/Semantics/cuf21.cuf b/flang/test/Semantics/cuf21.cuf index b8b99a8d1d9be..077657c8a52d5 100644 --- a/flang/test/Semantics/cuf21.cuf +++ b/flang/test/Semantics/cuf21.cuf @@ -9,19 +9,22 @@ module mlocModule end interface maxlocUpdate contains - attributes(global) subroutine maxlocPartialMaskR_32F1D() + attributes(global) subroutine maxlocPartialMaskR_32F1D(back) implicit none + logical, intent(in), value :: back real(4) :: mval - call maxlocUpdate(mval) + call maxlocUpdate(mval, back) end subroutine maxlocPartialMaskR_32F1D - attributes(device) subroutine maxlocUpdateR_32F(mval) + attributes(device) subroutine maxlocUpdateR_32F(mval, back) real(4) :: mval + logical :: back end subroutine maxlocUpdateR_32F - attributes(device) subroutine maxlocUpdateR_64F(mval) + attributes(device) subroutine maxlocUpdateR_64F(mval, back) real(8) :: mval + logical :: back end subroutine maxlocUpdateR_64F end module From 1bf4702d2bbaad522886dfbab913a8dd6efe3b85 Mon Sep 17 00:00:00 2001 From: Amy Huang Date: Tue, 10 
Jun 2025 16:18:53 -0700 Subject: [PATCH 016/851] Disable prctl test when building for arm or riscv. (#143627) I'm setting up a buildbot for arm32 using qemu and qemu doesn't support PR_GET_THP_DISABLE. Disable the test for now while we figure out what to do about that. Also disable for riscv because we may do the same for riscv buildbots. --- libc/test/src/sys/prctl/linux/CMakeLists.txt | 6 ++++++ libc/test/src/sys/prctl/linux/prctl_test.cpp | 1 + 2 files changed, 7 insertions(+) diff --git a/libc/test/src/sys/prctl/linux/CMakeLists.txt b/libc/test/src/sys/prctl/linux/CMakeLists.txt index b06e1c8087008..d02900e1857a0 100644 --- a/libc/test/src/sys/prctl/linux/CMakeLists.txt +++ b/libc/test/src/sys/prctl/linux/CMakeLists.txt @@ -1,5 +1,10 @@ add_custom_target(libc_sys_prctl_unittests) +# Temporarily disable this test while setting up arm and riscv buildbots +# using qemu, since PR_GET_THP_DISABLE is not supported on qemu. +if (NOT (LIBC_TARGET_ARCHITECTURE_IS_ARM OR + LIBC_TARGET_ARCHITECTURE_IS_RISCV32 OR + LIBC_TARGET_ARCHITECTURE_IS_RISCV64)) add_libc_unittest( prctl_test SUITE @@ -13,3 +18,4 @@ add_libc_unittest( libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) +endif() diff --git a/libc/test/src/sys/prctl/linux/prctl_test.cpp b/libc/test/src/sys/prctl/linux/prctl_test.cpp index 374c905e0ef8a..76b829c82d1be 100644 --- a/libc/test/src/sys/prctl/linux/prctl_test.cpp +++ b/libc/test/src/sys/prctl/linux/prctl_test.cpp @@ -34,6 +34,7 @@ TEST_F(LlvmLibcSysPrctlTest, GetSetName) { TEST_F(LlvmLibcSysPrctlTest, GetTHPDisable) { // Manually check errno since the return value logic here is not // covered in ErrnoSetterMatcher. + // Note that PR_GET_THP_DISABLE is not supported by QEMU. 
int ret = LIBC_NAMESPACE::prctl(PR_GET_THP_DISABLE, 0, 0, 0, 0); ASSERT_ERRNO_SUCCESS(); // PR_GET_THP_DISABLE return (as the function result) the current From ad479ddb343c2756e6eed0f2999bbdb88a65c7c5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 11 Jun 2025 08:49:13 +0900 Subject: [PATCH 017/851] Revert "[SeparateConstOffsetFromGEP] Decompose constant xor operand if possible (#135788)" This reverts commit 13ccce28776d8ad27b0c6a92b5a452d62da05663. The tests are on non-canonical IR, and the commit adds an extra unrelated pre-processing step to the pass. I'm assuming this is a workaround for the known-bits recursion depth limit in instcombine. --- .../Scalar/SeparateConstOffsetFromGEP.cpp | 193 ----------------- .../AMDGPU/xor-to-or-disjoint.ll | 204 ------------------ 2 files changed, 397 deletions(-) delete mode 100644 llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp index 6fae9f1dd2404..320b79203c0b3 100644 --- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp +++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp @@ -174,7 +174,6 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GetElementPtrTypeIterator.h" #include "llvm/IR/IRBuilder.h" -#include "llvm/IR/InstIterator.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" @@ -191,7 +190,6 @@ #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Scalar.h" -#include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Local.h" #include #include @@ -200,8 +198,6 @@ using namespace llvm; using namespace llvm::PatternMatch; -#define DEBUG_TYPE "separate-offset-gep" - static cl::opt DisableSeparateConstOffsetFromGEP( "disable-separate-const-offset-from-gep", cl::init(false), cl::desc("Do not separate the constant offset from a
GEP instruction"), @@ -492,42 +488,6 @@ class SeparateConstOffsetFromGEP { DenseMap> DominatingSubs; }; -/// A helper class that aims to convert xor operations into or operations when -/// their operands are disjoint and the result is used in a GEP's index. This -/// can then enable further GEP optimizations by effectively turning BaseVal | -/// Const into BaseVal + Const when they are disjoint, which -/// SeparateConstOffsetFromGEP can then process. This is a common pattern that -/// sets up a grid of memory accesses across a wave where each thread acesses -/// data at various offsets. -class XorToOrDisjointTransformer { -public: - XorToOrDisjointTransformer(Function &F, DominatorTree &DT, - const DataLayout &DL) - : F(F), DT(DT), DL(DL) {} - - bool run(); - -private: - Function &F; - DominatorTree &DT; - const DataLayout &DL; - /// Maps a common operand to all Xor instructions - using XorOpList = SmallVector, 8>; - using XorBaseValInst = DenseMap; - XorBaseValInst XorGroups; - - /// Checks if the given value has at least one GetElementPtr user - static bool hasGEPUser(const Value *V); - - /// Helper function to check if BaseXor dominates all XORs in the group - bool dominatesAllXors(BinaryOperator *BaseXor, const XorOpList &XorsInGroup); - - /// Processes a group of XOR instructions that share the same non-constant - /// base operand. Returns true if this group's processing modified the - /// function. 
- bool processXorGroup(Instruction *OriginalBaseInst, XorOpList &XorsInGroup); -}; - } // end anonymous namespace char SeparateConstOffsetFromGEPLegacyPass::ID = 0; @@ -1263,154 +1223,6 @@ bool SeparateConstOffsetFromGEP::splitGEP(GetElementPtrInst *GEP) { return true; } -// Helper function to check if an instruction has at least one GEP user -bool XorToOrDisjointTransformer::hasGEPUser(const Value *V) { - return llvm::any_of(V->users(), [](const User *U) { - return isa(U); - }); -} - -bool XorToOrDisjointTransformer::dominatesAllXors( - BinaryOperator *BaseXor, const XorOpList &XorsInGroup) { - return llvm::all_of(XorsInGroup, [&](const auto &XorEntry) { - BinaryOperator *XorInst = XorEntry.first; - // Do not evaluate the BaseXor, otherwise we end up cloning it. - return XorInst == BaseXor || DT.dominates(BaseXor, XorInst); - }); -} - -bool XorToOrDisjointTransformer::processXorGroup(Instruction *OriginalBaseInst, - XorOpList &XorsInGroup) { - bool Changed = false; - if (XorsInGroup.size() <= 1) - return false; - - // Sort XorsInGroup by the constant offset value in increasing order. - llvm::sort(XorsInGroup, [](const auto &A, const auto &B) { - return A.second.slt(B.second); - }); - - // Dominance check - // The "base" XOR for dominance purposes is the one with the smallest - // constant. 
- BinaryOperator *XorWithSmallConst = XorsInGroup[0].first; - - if (!dominatesAllXors(XorWithSmallConst, XorsInGroup)) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE - << ": Cloning and inserting XOR with smallest constant (" - << *XorWithSmallConst - << ") as it does not dominate all other XORs" - << " in function " << F.getName() << "\n"); - - BinaryOperator *ClonedXor = - cast(XorWithSmallConst->clone()); - ClonedXor->setName(XorWithSmallConst->getName() + ".dom_clone"); - ClonedXor->insertAfter(OriginalBaseInst); - LLVM_DEBUG(dbgs() << " Cloned Inst: " << *ClonedXor << "\n"); - Changed = true; - XorWithSmallConst = ClonedXor; - } - - SmallVector InstructionsToErase; - const APInt SmallestConst = - cast(XorWithSmallConst->getOperand(1))->getValue(); - - // Main transformation loop: Iterate over the original XORs in the sorted - // group. - for (const auto &XorEntry : XorsInGroup) { - BinaryOperator *XorInst = XorEntry.first; // Original XOR instruction - const APInt ConstOffsetVal = XorEntry.second; - - // Do not process the one with smallest constant as it is the base. 
- if (XorInst == XorWithSmallConst) - continue; - - // Disjointness Check 1 - APInt NewConstVal = ConstOffsetVal - SmallestConst; - if ((NewConstVal & SmallestConst) != 0) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE << ": Cannot transform XOR in function " - << F.getName() << ":\n" - << " New Const: " << NewConstVal - << " Smallest Const: " << SmallestConst - << " are not disjoint \n"); - continue; - } - - // Disjointness Check 2 - if (MaskedValueIsZero(XorWithSmallConst, NewConstVal, SimplifyQuery(DL), - 0)) { - LLVM_DEBUG(dbgs() << DEBUG_TYPE - << ": Transforming XOR to OR (disjoint) in function " - << F.getName() << ":\n" - << " Xor: " << *XorInst << "\n" - << " Base Val: " << *XorWithSmallConst << "\n" - << " New Const: " << NewConstVal << "\n"); - - auto *NewOrInst = BinaryOperator::CreateDisjointOr( - XorWithSmallConst, - ConstantInt::get(OriginalBaseInst->getType(), NewConstVal), - XorInst->getName() + ".or_disjoint", XorInst->getIterator()); - - NewOrInst->copyMetadata(*XorInst); - XorInst->replaceAllUsesWith(NewOrInst); - LLVM_DEBUG(dbgs() << " New Inst: " << *NewOrInst << "\n"); - InstructionsToErase.push_back(XorInst); // Mark original XOR for deletion - - Changed = true; - } else { - LLVM_DEBUG( - dbgs() << DEBUG_TYPE - << ": Cannot transform XOR (not proven disjoint) in function " - << F.getName() << ":\n" - << " Xor: " << *XorInst << "\n" - << " Base Val: " << *XorWithSmallConst << "\n" - << " New Const: " << NewConstVal << "\n"); - } - } - - for (Instruction *I : InstructionsToErase) - I->eraseFromParent(); - - return Changed; -} - -// Try to transform XOR(A, B+C) in to XOR(A,C) + B where XOR(A,C) becomes -// the base for memory operations. This transformation is true under the -// following conditions -// Check 1 - B and C are disjoint. -// Check 2 - XOR(A,C) and B are disjoint. -// -// This transformation is beneficial particularly for GEPs because: -// 1. OR operations often map better to addressing modes than XOR -// 2. 
Disjoint OR operations preserve the semantics of the original XOR -// 3. This can enable further optimizations in the GEP offset folding pipeline -bool XorToOrDisjointTransformer::run() { - bool Changed = false; - - // Collect all candidate XORs - for (Instruction &I : instructions(F)) { - Instruction *Op0 = nullptr; - ConstantInt *C1 = nullptr; - BinaryOperator *MatchedXorOp = nullptr; - - // Attempt to match the instruction 'I' as XOR operation. - if (match(&I, m_CombineAnd(m_Xor(m_Instruction(Op0), m_ConstantInt(C1)), - m_BinOp(MatchedXorOp))) && - hasGEPUser(MatchedXorOp)) - XorGroups[Op0].emplace_back(MatchedXorOp, C1->getValue()); - } - - if (XorGroups.empty()) - return false; - - // Process each group of XORs - for (auto &[OriginalBaseInst, XorsInGroup] : XorGroups) - if (processXorGroup(OriginalBaseInst, XorsInGroup)) - Changed = true; - - return Changed; -} - bool SeparateConstOffsetFromGEPLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) return false; @@ -1430,11 +1242,6 @@ bool SeparateConstOffsetFromGEP::run(Function &F) { DL = &F.getDataLayout(); bool Changed = false; - - // Decompose xor in to "or disjoint" if possible. 
- XorToOrDisjointTransformer XorTransformer(F, *DT, *DL); - Changed |= XorTransformer.run(); - for (BasicBlock &B : F) { if (!DT->isReachableFromEntry(&B)) continue; diff --git a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll b/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll deleted file mode 100644 index 825227292fe14..0000000000000 --- a/llvm/test/Transforms/SeparateConstOffsetFromGEP/AMDGPU/xor-to-or-disjoint.ll +++ /dev/null @@ -1,204 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -mtriple=amdgcn-amd-amdhsa -passes=separate-const-offset-from-gep \ -; RUN: -S < %s | FileCheck %s - - -; Test a simple case of xor to or disjoint transformation -define half @test_basic_transformation(ptr %ptr, i64 %input) { -; CHECK-LABEL: define half @test_basic_transformation( -; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[BASE:%.*]] = and i64 [[INPUT]], -8192 -; CHECK-NEXT: [[ADDR1:%.*]] = xor i64 [[BASE]], 32 -; CHECK-NEXT: [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048 -; CHECK-NEXT: [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 4096 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]] -; CHECK-NEXT: [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2 -; CHECK-NEXT: [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2 -; CHECK-NEXT: [[VAL1_F:%.*]] = fpext half [[VAL1]] to float -; CHECK-NEXT: [[VAL2_F:%.*]] = fpext half [[VAL2]] to float -; CHECK-NEXT: [[VAL3_F:%.*]] = fpext half [[VAL3]] to float -; CHECK-NEXT: [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]] -; CHECK-NEXT: [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]] 
-; CHECK-NEXT: [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half -; CHECK-NEXT: ret half [[RESULT_H]] -; -entry: - %base = and i64 %input, -8192 ; Clear low bits - %addr1 = xor i64 %base, 32 - %addr2 = xor i64 %base, 2080 - %addr3 = xor i64 %base, 4128 - %gep1 = getelementptr i8, ptr %ptr, i64 %addr1 - %gep2 = getelementptr i8, ptr %ptr, i64 %addr2 - %gep3 = getelementptr i8, ptr %ptr, i64 %addr3 - %val1 = load half, ptr %gep1 - %val2 = load half, ptr %gep2 - %val3 = load half, ptr %gep3 - %val1.f = fpext half %val1 to float - %val2.f = fpext half %val2 to float - %val3.f = fpext half %val3 to float - %sum1.f = fadd float %val1.f, %val2.f - %sum_total.f = fadd float %sum1.f, %val3.f - %result.h = fptrunc float %sum_total.f to half - ret half %result.h -} - - -; Test the decreasing order of offset xor to or disjoint transformation -define half @test_descending_offset_transformation(ptr %ptr, i64 %input) { -; CHECK-LABEL: define half @test_descending_offset_transformation( -; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[BASE:%.*]] = and i64 [[INPUT]], -8192 -; CHECK-NEXT: [[ADDR3_DOM_CLONE:%.*]] = xor i64 [[BASE]], 32 -; CHECK-NEXT: [[ADDR1_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 4096 -; CHECK-NEXT: [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 2048 -; CHECK-NEXT: [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR3_DOM_CLONE]], 0 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1_OR_DISJOINT]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]] -; CHECK-NEXT: [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2 -; CHECK-NEXT: [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2 -; CHECK-NEXT: [[VAL1_F:%.*]] = fpext half [[VAL1]] to float -; CHECK-NEXT: [[VAL2_F:%.*]] = fpext half 
[[VAL2]] to float -; CHECK-NEXT: [[VAL3_F:%.*]] = fpext half [[VAL3]] to float -; CHECK-NEXT: [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]] -; CHECK-NEXT: [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]] -; CHECK-NEXT: [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half -; CHECK-NEXT: ret half [[RESULT_H]] -; -entry: - %base = and i64 %input, -8192 ; Clear low bits - %addr1 = xor i64 %base, 4128 - %addr2 = xor i64 %base, 2080 - %addr3 = xor i64 %base, 32 - %gep1 = getelementptr i8, ptr %ptr, i64 %addr1 - %gep2 = getelementptr i8, ptr %ptr, i64 %addr2 - %gep3 = getelementptr i8, ptr %ptr, i64 %addr3 - %val1 = load half, ptr %gep1 - %val2 = load half, ptr %gep2 - %val3 = load half, ptr %gep3 - %val1.f = fpext half %val1 to float - %val2.f = fpext half %val2 to float - %val3.f = fpext half %val3 to float - %sum1.f = fadd float %val1.f, %val2.f - %sum_total.f = fadd float %sum1.f, %val3.f - %result.h = fptrunc float %sum_total.f to half - ret half %result.h -} - - -; Test that %addr2 is not transformed to or disjoint. 
-define half @test_no_transfomation(ptr %ptr, i64 %input) { -; CHECK-LABEL: define half @test_no_transfomation( -; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[BASE:%.*]] = and i64 [[INPUT]], -8192 -; CHECK-NEXT: [[ADDR1:%.*]] = xor i64 [[BASE]], 32 -; CHECK-NEXT: [[ADDR2:%.*]] = xor i64 [[BASE]], 64 -; CHECK-NEXT: [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 2048 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]] -; CHECK-NEXT: [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2 -; CHECK-NEXT: [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2 -; CHECK-NEXT: [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2 -; CHECK-NEXT: [[VAL1_F:%.*]] = fpext half [[VAL1]] to float -; CHECK-NEXT: [[VAL2_F:%.*]] = fpext half [[VAL2]] to float -; CHECK-NEXT: [[VAL3_F:%.*]] = fpext half [[VAL3]] to float -; CHECK-NEXT: [[SUM1_F:%.*]] = fadd float [[VAL1_F]], [[VAL2_F]] -; CHECK-NEXT: [[SUM_TOTAL_F:%.*]] = fadd float [[SUM1_F]], [[VAL3_F]] -; CHECK-NEXT: [[RESULT_H:%.*]] = fptrunc float [[SUM_TOTAL_F]] to half -; CHECK-NEXT: ret half [[RESULT_H]] -; -entry: - %base = and i64 %input, -8192 ; Clear low bits - %addr1 = xor i64 %base, 32 - %addr2 = xor i64 %base, 64 ; Should not be transformed - %addr3 = xor i64 %base, 2080 - %gep1 = getelementptr i8, ptr %ptr, i64 %addr1 - %gep2 = getelementptr i8, ptr %ptr, i64 %addr2 - %gep3 = getelementptr i8, ptr %ptr, i64 %addr3 - %val1 = load half, ptr %gep1 - %val2 = load half, ptr %gep2 - %val3 = load half, ptr %gep3 - %val1.f = fpext half %val1 to float - %val2.f = fpext half %val2 to float - %val3.f = fpext half %val3 to float - %sum1.f = fadd float %val1.f, %val2.f - %sum_total.f = fadd float %sum1.f, %val3.f - %result.h = fptrunc float %sum_total.f to half - ret half %result.h -} - - -; Test case with xor 
instructions in different basic blocks -define half @test_dom_tree(ptr %ptr, i64 %input, i1 %cond) { -; CHECK-LABEL: define half @test_dom_tree( -; CHECK-SAME: ptr [[PTR:%.*]], i64 [[INPUT:%.*]], i1 [[COND:%.*]]) { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[BASE:%.*]] = and i64 [[INPUT]], -8192 -; CHECK-NEXT: [[ADDR1:%.*]] = xor i64 [[BASE]], 16 -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR1]] -; CHECK-NEXT: [[VAL1:%.*]] = load half, ptr [[GEP1]], align 2 -; CHECK-NEXT: br i1 [[COND]], label %[[THEN:.*]], label %[[ELSE:.*]] -; CHECK: [[THEN]]: -; CHECK-NEXT: [[ADDR2_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 32 -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR2_OR_DISJOINT]] -; CHECK-NEXT: [[VAL2:%.*]] = load half, ptr [[GEP2]], align 2 -; CHECK-NEXT: br label %[[MERGE:.*]] -; CHECK: [[ELSE]]: -; CHECK-NEXT: [[ADDR3_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 96 -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR3_OR_DISJOINT]] -; CHECK-NEXT: [[VAL3:%.*]] = load half, ptr [[GEP3]], align 2 -; CHECK-NEXT: br label %[[MERGE]] -; CHECK: [[MERGE]]: -; CHECK-NEXT: [[VAL_FROM_BRANCH:%.*]] = phi half [ [[VAL2]], %[[THEN]] ], [ [[VAL3]], %[[ELSE]] ] -; CHECK-NEXT: [[ADDR4_OR_DISJOINT:%.*]] = or disjoint i64 [[ADDR1]], 224 -; CHECK-NEXT: [[GEP4:%.*]] = getelementptr i8, ptr [[PTR]], i64 [[ADDR4_OR_DISJOINT]] -; CHECK-NEXT: [[VAL4:%.*]] = load half, ptr [[GEP4]], align 2 -; CHECK-NEXT: [[VAL1_F:%.*]] = fpext half [[VAL1]] to float -; CHECK-NEXT: [[VAL_FROM_BRANCH_F:%.*]] = fpext half [[VAL_FROM_BRANCH]] to float -; CHECK-NEXT: [[VAL4_F:%.*]] = fpext half [[VAL4]] to float -; CHECK-NEXT: [[SUM_INTERMEDIATE_F:%.*]] = fadd float [[VAL1_F]], [[VAL_FROM_BRANCH_F]] -; CHECK-NEXT: [[FINAL_SUM_F:%.*]] = fadd float [[SUM_INTERMEDIATE_F]], [[VAL4_F]] -; CHECK-NEXT: [[RESULT_H:%.*]] = fptrunc float [[FINAL_SUM_F]] to half -; CHECK-NEXT: ret half [[RESULT_H]] -; -entry: - %base = and i64 %input, -8192 ; Clear 
low bits - %addr1 = xor i64 %base,16 - %gep1 = getelementptr i8, ptr %ptr, i64 %addr1 - %val1 = load half, ptr %gep1 - br i1 %cond, label %then, label %else - -then: - %addr2 = xor i64 %base, 48 - %gep2 = getelementptr i8, ptr %ptr, i64 %addr2 - %val2 = load half, ptr %gep2 - br label %merge - -else: - %addr3 = xor i64 %base, 112 - %gep3 = getelementptr i8, ptr %ptr, i64 %addr3 - %val3 = load half, ptr %gep3 - br label %merge - -merge: - %val_from_branch = phi half [ %val2, %then ], [ %val3, %else ] - %addr4 = xor i64 %base, 240 - %gep4 = getelementptr i8, ptr %ptr, i64 %addr4 - %val4 = load half, ptr %gep4 - %val1.f = fpext half %val1 to float - %val_from_branch.f = fpext half %val_from_branch to float - %val4.f = fpext half %val4 to float - %sum_intermediate.f = fadd float %val1.f, %val_from_branch.f - %final_sum.f = fadd float %sum_intermediate.f, %val4.f - %result.h = fptrunc float %final_sum.f to half - ret half %result.h -} - From b9329fe88e47741d9c20ab92f892ac52457e6195 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Tue, 10 Jun 2025 16:50:29 -0700 Subject: [PATCH 018/851] [CIR] Upstream support for calling constructors (#143579) This change adds support for calling C++ constructors. The support for actually defining a constructor is still missing and will be added in a later change. 
--- clang/include/clang/CIR/MissingFeatures.h | 3 + clang/lib/CIR/CodeGen/CIRGenCall.cpp | 99 +++++++++++++++++-- clang/lib/CIR/CodeGen/CIRGenClass.cpp | 74 ++++++++++++++ clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 51 ++++++++++ clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp | 6 ++ clang/lib/CIR/CodeGen/CIRGenFunction.h | 13 +++ clang/lib/CIR/CodeGen/CIRGenModule.cpp | 54 +++++++++- clang/lib/CIR/CodeGen/CIRGenModule.h | 19 ++++ clang/lib/CIR/CodeGen/CIRGenTypes.h | 6 ++ clang/test/CIR/CodeGen/ctor.cpp | 19 ++++ 10 files changed, 336 insertions(+), 8 deletions(-) create mode 100644 clang/test/CIR/CodeGen/ctor.cpp diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 72d882beb2244..f89d386378e51 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -222,6 +222,9 @@ struct MissingFeatures { static bool instrumentation() { return false; } static bool cleanupAfterErrorDiags() { return false; } static bool cxxRecordStaticMembers() { return false; } + static bool isMemcpyEquivalentSpecialMember() { return false; } + static bool isTrivialCtorOrDtor() { return false; } + static bool implicitConstructorArgs() { return false; } // Missing types static bool dataMemberType() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp index b194a8670bfb9..9d25eea9e413d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp @@ -60,6 +60,13 @@ CIRGenCallee CIRGenCallee::prepareConcreteCallee(CIRGenFunction &cgf) const { return *this; } +/// Returns the canonical formal type of the given C++ method. +static CanQual getFormalType(const CXXMethodDecl *md) { + return md->getType() + ->getCanonicalTypeUnqualified() + .getAs(); +} + /// Adds the formal parameters in FPT to the given prefix. If any parameter in /// FPT has pass_object_size_attrs, then we'll add parameters for those, too. 
/// TODO(cir): this should be shared with LLVM codegen @@ -76,6 +83,48 @@ static void appendParameterTypes(const CIRGenTypes &cgt, cgt.getCGModule().errorNYI("appendParameterTypes: hasExtParameterInfos"); } +const CIRGenFunctionInfo & +CIRGenTypes::arrangeCXXStructorDeclaration(GlobalDecl gd) { + auto *md = cast(gd.getDecl()); + + llvm::SmallVector argTypes; + argTypes.push_back(deriveThisType(md->getParent(), md)); + + bool passParams = true; + + if (auto *cd = dyn_cast(md)) { + // A base class inheriting constructor doesn't get forwarded arguments + // needed to construct a virtual base (or base class thereof) + if (cd->getInheritedConstructor()) + cgm.errorNYI(cd->getSourceRange(), + "arrangeCXXStructorDeclaration: inheriting constructor"); + } + + CanQual fpt = getFormalType(md); + + if (passParams) + appendParameterTypes(*this, argTypes, fpt); + + assert(!cir::MissingFeatures::implicitConstructorArgs()); + + RequiredArgs required = + (passParams && md->isVariadic() ? RequiredArgs(argTypes.size()) + : RequiredArgs::All); + + CanQualType resultType = theCXXABI.hasThisReturn(gd) ? argTypes.front() + : theCXXABI.hasMostDerivedReturn(gd) + ? astContext.VoidPtrTy + : astContext.VoidTy; + + assert(!theCXXABI.hasThisReturn(gd) && + "Please send PR with a test and remove this"); + + assert(!cir::MissingFeatures::opCallCIRGenFuncInfoExtParamInfo()); + assert(!cir::MissingFeatures::opCallFnInfoOpts()); + + return arrangeCIRFunctionInfo(resultType, argTypes, required); +} + /// Derives the 'this' type for CIRGen purposes, i.e. ignoring method CVR /// qualification. Either or both of `rd` and `md` may be null. A null `rd` /// indicates that there is no meaningful 'this' type, and a null `md` can occur @@ -103,13 +152,13 @@ CanQualType CIRGenTypes::deriveThisType(const CXXRecordDecl *rd, /// top of any implicit parameters already stored. 
static const CIRGenFunctionInfo & arrangeCIRFunctionInfo(CIRGenTypes &cgt, SmallVectorImpl &prefix, - CanQual ftp) { + CanQual fpt) { assert(!cir::MissingFeatures::opCallFnInfoOpts()); RequiredArgs required = - RequiredArgs::getFromProtoWithExtraSlots(ftp, prefix.size()); + RequiredArgs::getFromProtoWithExtraSlots(fpt, prefix.size()); assert(!cir::MissingFeatures::opCallExtParameterInfo()); - appendParameterTypes(cgt, prefix, ftp); - CanQualType resultType = ftp->getReturnType().getUnqualifiedType(); + appendParameterTypes(cgt, prefix, fpt); + CanQualType resultType = fpt->getReturnType().getUnqualifiedType(); return cgt.arrangeCIRFunctionInfo(resultType, prefix, required); } @@ -141,6 +190,44 @@ arrangeFreeFunctionLikeCall(CIRGenTypes &cgt, CIRGenModule &cgm, return cgt.arrangeCIRFunctionInfo(retType, argTypes, required); } +/// Arrange a call to a C++ method, passing the given arguments. +/// +/// passProtoArgs indicates whether `args` has args for the parameters in the +/// given CXXConstructorDecl. +const CIRGenFunctionInfo &CIRGenTypes::arrangeCXXConstructorCall( + const CallArgList &args, const CXXConstructorDecl *d, CXXCtorType ctorKind, + bool passProtoArgs) { + + // FIXME: Kill copy. + llvm::SmallVector argTypes; + for (const auto &arg : args) + argTypes.push_back(astContext.getCanonicalParamType(arg.ty)); + + assert(!cir::MissingFeatures::implicitConstructorArgs()); + // +1 for implicit this, which should always be args[0] + unsigned totalPrefixArgs = 1; + + CanQual fpt = getFormalType(d); + RequiredArgs required = + passProtoArgs + ? 
RequiredArgs::getFromProtoWithExtraSlots(fpt, totalPrefixArgs) + : RequiredArgs::All; + + GlobalDecl gd(d, ctorKind); + if (theCXXABI.hasThisReturn(gd)) + cgm.errorNYI(d->getSourceRange(), + "arrangeCXXConstructorCall: hasThisReturn"); + if (theCXXABI.hasMostDerivedReturn(gd)) + cgm.errorNYI(d->getSourceRange(), + "arrangeCXXConstructorCall: hasMostDerivedReturn"); + CanQualType resultType = astContext.VoidTy; + + assert(!cir::MissingFeatures::opCallFnInfoOpts()); + assert(!cir::MissingFeatures::opCallCIRGenFuncInfoExtParamInfo()); + + return arrangeCIRFunctionInfo(resultType, argTypes, required); +} + /// Arrange a call to a C++ method, passing the given arguments. /// /// numPrefixArgs is the number of the ABI-specific prefix arguments we have. It @@ -198,7 +285,7 @@ CIRGenTypes::arrangeCXXMethodDeclaration(const CXXMethodDecl *md) { /// constructor or destructor. const CIRGenFunctionInfo & CIRGenTypes::arrangeCXXMethodType(const CXXRecordDecl *rd, - const FunctionProtoType *ftp, + const FunctionProtoType *fpt, const CXXMethodDecl *md) { llvm::SmallVector argTypes; @@ -208,7 +295,7 @@ CIRGenTypes::arrangeCXXMethodType(const CXXRecordDecl *rd, assert(!cir::MissingFeatures::opCallFnInfoOpts()); return ::arrangeCIRFunctionInfo( *this, argTypes, - ftp->getCanonicalTypeUnqualified().getAs()); + fpt->getCanonicalTypeUnqualified().getAs()); } /// Arrange the argument and result information for the declaration or diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index 4cdaa480121dd..8491a66ea6cb4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -10,9 +10,12 @@ // //===----------------------------------------------------------------------===// +#include "CIRGenCXXABI.h" #include "CIRGenFunction.h" +#include "clang/AST/ExprCXX.h" #include "clang/AST/RecordLayout.h" +#include "clang/AST/Type.h" #include "clang/CIR/MissingFeatures.h" using namespace clang; @@ -63,3 +66,74 @@ Address 
CIRGenFunction::getAddressOfBaseClass( return value; } + +void CIRGenFunction::emitCXXConstructorCall(const clang::CXXConstructorDecl *d, + clang::CXXCtorType type, + bool forVirtualBase, + bool delegating, + AggValueSlot thisAVS, + const clang::CXXConstructExpr *e) { + CallArgList args; + Address thisAddr = thisAVS.getAddress(); + QualType thisType = d->getThisType(); + mlir::Value thisPtr = thisAddr.getPointer(); + + assert(!cir::MissingFeatures::addressSpace()); + + args.add(RValue::get(thisPtr), thisType); + + // In LLVM Codegen: If this is a trivial constructor, just emit what's needed. + // If this is a union copy constructor, we must emit a memcpy, because the AST + // does not model that copy. + assert(!cir::MissingFeatures::isMemcpyEquivalentSpecialMember()); + + const FunctionProtoType *fpt = d->getType()->castAs(); + + assert(!cir::MissingFeatures::opCallArgEvaluationOrder()); + + emitCallArgs(args, fpt, e->arguments(), e->getConstructor(), + /*ParamsToSkip=*/0); + + assert(!cir::MissingFeatures::sanitizers()); + emitCXXConstructorCall(d, type, forVirtualBase, delegating, thisAddr, args, + e->getExprLoc()); +} + +void CIRGenFunction::emitCXXConstructorCall( + const CXXConstructorDecl *d, CXXCtorType type, bool forVirtualBase, + bool delegating, Address thisAddr, CallArgList &args, SourceLocation loc) { + + const CXXRecordDecl *crd = d->getParent(); + + // If this is a call to a trivial default constructor: + // In LLVM: do nothing. + // In CIR: emit as a regular call, other later passes should lower the + // ctor call into trivial initialization. + assert(!cir::MissingFeatures::isTrivialCtorOrDtor()); + + assert(!cir::MissingFeatures::isMemcpyEquivalentSpecialMember()); + + bool passPrototypeArgs = true; + + // Check whether we can actually emit the constructor before trying to do so. 
+ if (d->getInheritedConstructor()) { + cgm.errorNYI(d->getSourceRange(), + "emitCXXConstructorCall: inherited constructor"); + return; + } + + // Insert any ABI-specific implicit constructor arguments. + assert(!cir::MissingFeatures::implicitConstructorArgs()); + + // Emit the call. + auto calleePtr = cgm.getAddrOfCXXStructor(GlobalDecl(d, type)); + const CIRGenFunctionInfo &info = cgm.getTypes().arrangeCXXConstructorCall( + args, d, type, passPrototypeArgs); + CIRGenCallee callee = CIRGenCallee::forDirect(calleePtr, GlobalDecl(d, type)); + cir::CIRCallOpInterface c; + emitCall(info, callee, ReturnValueSlot(), args, &c, getLoc(loc)); + + if (cgm.getCodeGenOpts().OptimizationLevel != 0 && !crd->isDynamicClass() && + type != Ctor_Base && cgm.getCodeGenOpts().StrictVTablePointers) + cgm.errorNYI(d->getSourceRange(), "vtable assumption loads"); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 8129fe0ad7db7..f2c2de7a4f59d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1393,6 +1393,57 @@ RValue CIRGenFunction::emitCXXMemberCallExpr(const CXXMemberCallExpr *ce, ce, md, returnValue, hasQualifier, qualifier, isArrow, base); } +void CIRGenFunction::emitCXXConstructExpr(const CXXConstructExpr *e, + AggValueSlot dest) { + assert(!dest.isIgnored() && "Must have a destination!"); + const CXXConstructorDecl *cd = e->getConstructor(); + + // If we require zero initialization before (or instead of) calling the + // constructor, as can be the case with a non-user-provided default + // constructor, emit the zero initialization now, unless destination is + // already zeroed. + if (e->requiresZeroInitialization() && !dest.isZeroed()) { + cgm.errorNYI(e->getSourceRange(), + "emitCXXConstructExpr: requires initialization"); + return; + } + + // If this is a call to a trivial default constructor: + // In LLVM: do nothing. 
+ // In CIR: emit as a regular call, other later passes should lower the + // ctor call into trivial initialization. + + // Elide the constructor if we're constructing from a temporary + if (getLangOpts().ElideConstructors && e->isElidable()) { + cgm.errorNYI(e->getSourceRange(), + "emitCXXConstructExpr: elidable constructor"); + return; + } + + if (getContext().getAsArrayType(e->getType())) { + cgm.errorNYI(e->getSourceRange(), "emitCXXConstructExpr: array type"); + return; + } + + clang::CXXCtorType type = Ctor_Complete; + bool forVirtualBase = false; + bool delegating = false; + + switch (e->getConstructionKind()) { + case CXXConstructionKind::Complete: + type = Ctor_Complete; + break; + case CXXConstructionKind::Delegating: + case CXXConstructionKind::VirtualBase: + case CXXConstructionKind::NonVirtualBase: + cgm.errorNYI(e->getSourceRange(), + "emitCXXConstructExpr: other construction kind"); + return; + } + + emitCXXConstructorCall(cd, type, forVirtualBase, delegating, dest, e); +} + RValue CIRGenFunction::emitReferenceBindingToExpr(const Expr *e) { // Emit the expression as an lvalue. 
LValue lv = emitLValue(e); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 56d7ea3884ba7..f1df1b79fc48e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -51,6 +51,7 @@ class AggExprEmitter : public StmtVisitor { void Visit(Expr *e) { StmtVisitor::Visit(e); } void VisitInitListExpr(InitListExpr *e); + void VisitCXXConstructExpr(const CXXConstructExpr *e); void visitCXXParenListOrInitListExpr(Expr *e, ArrayRef args, FieldDecl *initializedFieldInUnion, @@ -213,6 +214,11 @@ void AggExprEmitter::emitInitializationToLValue(Expr *e, LValue lv) { } } +void AggExprEmitter::VisitCXXConstructExpr(const CXXConstructExpr *e) { + AggValueSlot slot = ensureSlot(cgf.getLoc(e->getSourceRange()), e->getType()); + cgf.emitCXXConstructExpr(e, slot); +} + void AggExprEmitter::emitNullInitializationToLValue(mlir::Location loc, LValue lv) { const QualType type = lv.getType(); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index d6002c3e4d4d9..7db7f6928fd8f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -744,6 +744,19 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitContinueStmt(const clang::ContinueStmt &s); + void emitCXXConstructExpr(const clang::CXXConstructExpr *e, + AggValueSlot dest); + + void emitCXXConstructorCall(const clang::CXXConstructorDecl *d, + clang::CXXCtorType type, bool forVirtualBase, + bool delegating, AggValueSlot thisAVS, + const clang::CXXConstructExpr *e); + + void emitCXXConstructorCall(const clang::CXXConstructorDecl *d, + clang::CXXCtorType type, bool forVirtualBase, + bool delegating, Address thisAddr, + CallArgList &args, clang::SourceLocation loc); + mlir::LogicalResult emitCXXForRangeStmt(const CXXForRangeStmt &s, llvm::ArrayRef attrs); diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp 
b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 3d46c44b4f1ec..8407f8fad06ba 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -103,6 +103,25 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, CIRGenModule::~CIRGenModule() = default; +/// FIXME: this could likely be a common helper and not necessarily related +/// with codegen. +/// Return the best known alignment for an unknown pointer to a +/// particular class. +CharUnits CIRGenModule::getClassPointerAlignment(const CXXRecordDecl *rd) { + if (!rd->hasDefinition()) + return CharUnits::One(); // Hopefully won't be used anywhere. + + auto &layout = astContext.getASTRecordLayout(rd); + + // If the class is final, then we know that the pointer points to an + // object of that type and can use the full alignment. + if (rd->isEffectivelyFinal()) + return layout.getAlignment(); + + // Otherwise, we have to assume it could be a subclass. + return layout.getNonVirtualAlignment(); +} + CharUnits CIRGenModule::getNaturalTypeAlignment(QualType t, LValueBaseInfo *baseInfo) { assert(!cir::MissingFeatures::opTBAA()); @@ -1174,6 +1193,34 @@ void CIRGenModule::setInitializer(cir::GlobalOp &op, mlir::Attribute value) { assert(!cir::MissingFeatures::opGlobalVisibility()); } +std::pair CIRGenModule::getAddrAndTypeOfCXXStructor( + GlobalDecl gd, const CIRGenFunctionInfo *fnInfo, cir::FuncType fnType, + bool dontDefer, ForDefinition_t isForDefinition) { + auto *md = cast(gd.getDecl()); + + if (isa(md)) { + // Always alias equivalent complete destructors to base destructors in the + // MS ABI. 
+ if (getTarget().getCXXABI().isMicrosoft() && + gd.getDtorType() == Dtor_Complete && + md->getParent()->getNumVBases() == 0) + errorNYI(md->getSourceRange(), + "getAddrAndTypeOfCXXStructor: MS ABI complete destructor"); + } + + if (!fnType) { + if (!fnInfo) + fnInfo = &getTypes().arrangeCXXStructorDeclaration(gd); + fnType = getTypes().getFunctionType(*fnInfo); + } + + auto fn = getOrCreateCIRFunction(getMangledName(gd), fnType, gd, + /*ForVtable=*/false, dontDefer, + /*IsThunk=*/false, isForDefinition); + + return {fnType, fn}; +} + cir::FuncOp CIRGenModule::getAddrOfFunction(clang::GlobalDecl gd, mlir::Type funcType, bool forVTable, bool dontDefer, @@ -1248,8 +1295,11 @@ StringRef CIRGenModule::getMangledName(GlobalDecl gd) { // Some ABIs don't have constructor variants. Make sure that base and complete // constructors get mangled the same. if (const auto *cd = dyn_cast(canonicalGd.getDecl())) { - errorNYI(cd->getSourceRange(), "getMangledName: C++ constructor"); - return cast(gd.getDecl())->getIdentifier()->getName(); + if (!getTarget().getCXXABI().hasConstructorVariants()) { + errorNYI(cd->getSourceRange(), + "getMangledName: C++ constructor without variants"); + return cast(gd.getDecl())->getIdentifier()->getName(); + } } // Keep the first result in the case of a mangling collision. diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 24ec9ca6403bc..9748c0b3ed43a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -166,11 +166,30 @@ class CIRGenModule : public CIRGenTypeCache { mlir::Location getLoc(clang::SourceLocation cLoc); mlir::Location getLoc(clang::SourceRange cRange); + /// Return the best known alignment for an unknown pointer to a + /// particular class. + clang::CharUnits getClassPointerAlignment(const clang::CXXRecordDecl *rd); + /// FIXME: this could likely be a common helper and not necessarily related /// with codegen. 
clang::CharUnits getNaturalTypeAlignment(clang::QualType t, LValueBaseInfo *baseInfo); + cir::FuncOp + getAddrOfCXXStructor(clang::GlobalDecl gd, + const CIRGenFunctionInfo *fnInfo = nullptr, + cir::FuncType fnType = nullptr, bool dontDefer = false, + ForDefinition_t isForDefinition = NotForDefinition) { + return getAddrAndTypeOfCXXStructor(gd, fnInfo, fnType, dontDefer, + isForDefinition) + .second; + } + + std::pair getAddrAndTypeOfCXXStructor( + clang::GlobalDecl gd, const CIRGenFunctionInfo *fnInfo = nullptr, + cir::FuncType fnType = nullptr, bool dontDefer = false, + ForDefinition_t isForDefinition = NotForDefinition); + /// This contains all the decls which have definitions but which are deferred /// for emission and therefore should only be output if they are actually /// used. If a decl is in this, then it is known to have not been referenced diff --git a/clang/lib/CIR/CodeGen/CIRGenTypes.h b/clang/lib/CIR/CodeGen/CIRGenTypes.h index 48d474beeddec..c2813d79bf63b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypes.h +++ b/clang/lib/CIR/CodeGen/CIRGenTypes.h @@ -19,6 +19,7 @@ #include "clang/AST/DeclCXX.h" #include "clang/AST/Type.h" +#include "clang/Basic/ABI.h" #include "clang/CIR/Dialect/IR/CIRTypes.h" #include "llvm/ADT/SmallPtrSet.h" @@ -165,6 +166,10 @@ class CIRGenTypes { bool isZeroInitializable(clang::QualType ty); bool isZeroInitializable(const RecordDecl *rd); + const CIRGenFunctionInfo &arrangeCXXConstructorCall( + const CallArgList &args, const clang::CXXConstructorDecl *d, + clang::CXXCtorType ctorKind, bool passProtoArgs = true); + const CIRGenFunctionInfo & arrangeCXXMethodCall(const CallArgList &args, const clang::FunctionProtoType *type, @@ -173,6 +178,7 @@ class CIRGenTypes { /// C++ methods have some special rules and also have implicit parameters. 
const CIRGenFunctionInfo & arrangeCXXMethodDeclaration(const clang::CXXMethodDecl *md); + const CIRGenFunctionInfo &arrangeCXXStructorDeclaration(clang::GlobalDecl gd); const CIRGenFunctionInfo & arrangeCXXMethodType(const clang::CXXRecordDecl *rd, diff --git a/clang/test/CIR/CodeGen/ctor.cpp b/clang/test/CIR/CodeGen/ctor.cpp new file mode 100644 index 0000000000000..3a1e82e338c1c --- /dev/null +++ b/clang/test/CIR/CodeGen/ctor.cpp @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s + +struct Struk { + int a; + Struk(); +}; + +void baz() { + Struk s; +} + +// CHECK: !rec_Struk = !cir.record + +// CHECK: cir.func @_ZN5StrukC1Ev(!cir.ptr) +// CHECK: cir.func @_Z3bazv() +// CHECK-NEXT: %[[S_ADDR:.*]] = cir.alloca !rec_Struk, !cir.ptr, ["s", init] {alignment = 4 : i64} +// CHECK-NEXT: cir.call @_ZN5StrukC1Ev(%[[S_ADDR]]) : (!cir.ptr) -> () +// CHECK-NEXT: cir.return From 6f62979a5a5bcf70d65f23e0991a274e6df5955b Mon Sep 17 00:00:00 2001 From: George Burgess IV Date: Tue, 10 Jun 2025 16:57:16 -0700 Subject: [PATCH 019/851] Revert "[CI] Migrate to runtimes build" (#143612) Reverts llvm/llvm-project#142696 See https://github.com/llvm/llvm-project/issues/143610 for details; I believe this PR causes CI builders to build LLVM in a way that's been broken for a while. To keep CI green, if this is the correct culprit, those tests should be fixed or skipped --- .ci/compute_projects.py | 115 ++++++++++++-------------------- .ci/compute_projects_test.py | 55 ++------------- .ci/monolithic-linux.sh | 13 +--- .github/workflows/premerge.yaml | 3 +- 4 files changed, 49 insertions(+), 137 deletions(-) diff --git a/.ci/compute_projects.py b/.ci/compute_projects.py index e61b8dc5021f3..40dd0507a9eaf 100644 --- a/.ci/compute_projects.py +++ b/.ci/compute_projects.py @@ -49,7 +49,8 @@ }, "lld": {"bolt", "cross-project-tests"}, # TODO(issues/132795): LLDB should be enabled on clang changes. 
- "clang": {"clang-tools-extra", "cross-project-tests"}, + "clang": {"clang-tools-extra", "compiler-rt", "cross-project-tests"}, + "clang-tools-extra": {"libc"}, "mlir": {"flang"}, # Test everything if ci scripts are changed. # FIXME: Figure out what is missing and add here. @@ -63,15 +64,7 @@ # This mapping describes runtimes that should be tested when the key project is # touched. -DEPENDENT_RUNTIMES_TO_TEST = { - "clang": {"compiler-rt"}, - "clang-tools-extra": {"libc"}, -} -DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG = { - "llvm": {"libcxx", "libcxxabi", "libunwind"}, - "clang": {"libcxx", "libcxxabi", "libunwind"}, - ".ci": {"libcxx", "libcxxabi", "libunwind"}, -} +DEPENDENT_RUNTIMES_TO_TEST = {"clang": {"libcxx", "libcxxabi", "libunwind"}} EXCLUDE_LINUX = { "cross-project-tests", # TODO(issues/132796): Tests are failing. @@ -100,6 +93,9 @@ "cross-project-tests", "flang", "libc", + "libcxx", + "libcxxabi", + "libunwind", "lldb", "openmp", "polly", @@ -126,10 +122,10 @@ "polly": "check-polly", } -RUNTIMES = {"libcxx", "libcxxabi", "libunwind", "compiler-rt", "libc"} +RUNTIMES = {"libcxx", "libcxxabi", "libunwind"} -def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]: +def _add_dependencies(projects: Set[str]) -> Set[str]: projects_with_dependents = set(projects) current_projects_count = 0 while current_projects_count != len(projects_with_dependents): @@ -138,25 +134,9 @@ def _add_dependencies(projects: Set[str], runtimes: Set[str]) -> Set[str]: if project not in PROJECT_DEPENDENCIES: continue projects_with_dependents.update(PROJECT_DEPENDENCIES[project]) - for runtime in runtimes: - if runtime not in PROJECT_DEPENDENCIES: - continue - projects_with_dependents.update(PROJECT_DEPENDENCIES[runtime]) return projects_with_dependents -def _exclude_projects(current_projects: Set[str], platform: str) -> Set[str]: - if platform == "Linux": - to_exclude = EXCLUDE_LINUX - elif platform == "Windows": - to_exclude = EXCLUDE_WINDOWS - elif platform == 
"Darwin": - to_exclude = EXCLUDE_MAC - else: - raise ValueError(f"Unexpected platform: {platform}") - return current_projects.difference(to_exclude) - - def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set[str]: projects_to_test = set() for modified_project in modified_projects: @@ -174,14 +154,25 @@ def _compute_projects_to_test(modified_projects: Set[str], platform: str) -> Set ): continue projects_to_test.add(dependent_project) - projects_to_test = _exclude_projects(projects_to_test, platform) + if platform == "Linux": + for to_exclude in EXCLUDE_LINUX: + if to_exclude in projects_to_test: + projects_to_test.remove(to_exclude) + elif platform == "Windows": + for to_exclude in EXCLUDE_WINDOWS: + if to_exclude in projects_to_test: + projects_to_test.remove(to_exclude) + elif platform == "Darwin": + for to_exclude in EXCLUDE_MAC: + if to_exclude in projects_to_test: + projects_to_test.remove(to_exclude) + else: + raise ValueError("Unexpected platform.") return projects_to_test -def _compute_projects_to_build( - projects_to_test: Set[str], runtimes: Set[str] -) -> Set[str]: - return _add_dependencies(projects_to_test, runtimes) +def _compute_projects_to_build(projects_to_test: Set[str]) -> Set[str]: + return _add_dependencies(projects_to_test) def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]: @@ -193,36 +184,24 @@ def _compute_project_check_targets(projects_to_test: Set[str]) -> Set[str]: return check_targets -def _compute_runtimes_to_test(modified_projects: Set[str], platform: str) -> Set[str]: +def _compute_runtimes_to_test(projects_to_test: Set[str]) -> Set[str]: runtimes_to_test = set() - for modified_project in modified_projects: - if modified_project not in DEPENDENT_RUNTIMES_TO_TEST: - continue - runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[modified_project]) - return _exclude_projects(runtimes_to_test, platform) + for project_to_test in projects_to_test: + if project_to_test in 
DEPENDENT_RUNTIMES_TO_TEST: + runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_TEST[project_to_test]) + if project_to_test in DEPENDENT_RUNTIMES_TO_BUILD: + runtimes_to_test.update(DEPENDENT_RUNTIMES_TO_BUILD[project_to_test]) + return runtimes_to_test -def _compute_runtimes_to_test_needs_reconfig( - modified_projects: Set[str], platform: str -) -> Set[str]: - runtimes_to_test = set() - for modified_project in modified_projects: - if modified_project not in DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG: +def _compute_runtime_check_targets(projects_to_test: Set[str]) -> Set[str]: + check_targets = set() + for project_to_test in projects_to_test: + if project_to_test not in DEPENDENT_RUNTIMES_TO_TEST: continue - runtimes_to_test.update( - DEPENDENT_RUNTIMES_TO_TEST_NEEDS_RECONFIG[modified_project] - ) - return _exclude_projects(runtimes_to_test, platform) - - -def _compute_runtimes_to_build( - runtimes_to_test: Set[str], modified_projects: Set[str], platform: str -) -> Set[str]: - runtimes_to_build = set(runtimes_to_test) - for modified_project in modified_projects: - if modified_project in DEPENDENT_RUNTIMES_TO_BUILD: - runtimes_to_build.update(DEPENDENT_RUNTIMES_TO_BUILD[modified_project]) - return _exclude_projects(runtimes_to_build, platform) + for runtime_to_test in DEPENDENT_RUNTIMES_TO_TEST[project_to_test]: + check_targets.add(PROJECT_CHECK_TARGETS[runtime_to_test]) + return check_targets def _get_modified_projects(modified_files: list[str]) -> Set[str]: @@ -246,19 +225,10 @@ def _get_modified_projects(modified_files: list[str]) -> Set[str]: def get_env_variables(modified_files: list[str], platform: str) -> Set[str]: modified_projects = _get_modified_projects(modified_files) projects_to_test = _compute_projects_to_test(modified_projects, platform) - runtimes_to_test = _compute_runtimes_to_test(modified_projects, platform) - runtimes_to_test_needs_reconfig = _compute_runtimes_to_test_needs_reconfig( - modified_projects, platform - ) - runtimes_to_build = 
_compute_runtimes_to_build( - runtimes_to_test | runtimes_to_test_needs_reconfig, modified_projects, platform - ) - projects_to_build = _compute_projects_to_build(projects_to_test, runtimes_to_build) + projects_to_build = _compute_projects_to_build(projects_to_test) projects_check_targets = _compute_project_check_targets(projects_to_test) - runtimes_check_targets = _compute_project_check_targets(runtimes_to_test) - runtimes_check_targets_needs_reconfig = _compute_project_check_targets( - runtimes_to_test_needs_reconfig - ) + runtimes_to_build = _compute_runtimes_to_test(projects_to_test) + runtimes_check_targets = _compute_runtime_check_targets(projects_to_test) # We use a semicolon to separate the projects/runtimes as they get passed # to the CMake invocation and thus we need to use the CMake list separator # (;). We use spaces to separate the check targets as they end up getting @@ -268,9 +238,6 @@ def get_env_variables(modified_files: list[str], platform: str) -> Set[str]: "project_check_targets": " ".join(sorted(projects_check_targets)), "runtimes_to_build": ";".join(sorted(runtimes_to_build)), "runtimes_check_targets": " ".join(sorted(runtimes_check_targets)), - "runtimes_check_targets_needs_reconfig": " ".join( - sorted(runtimes_check_targets_needs_reconfig) - ), } diff --git a/.ci/compute_projects_test.py b/.ci/compute_projects_test.py index 6bc2e34a1cbe1..ae376ea6a43cd 100644 --- a/.ci/compute_projects_test.py +++ b/.ci/compute_projects_test.py @@ -26,10 +26,6 @@ def test_llvm(self): ) self.assertEqual( env_variables["runtimes_check_targets"], - "", - ) - self.assertEqual( - env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) @@ -50,10 +46,6 @@ def test_llvm_windows(self): ) self.assertEqual( env_variables["runtimes_check_targets"], - "", - ) - self.assertEqual( - env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) @@ -74,10 +66,6 @@ def test_llvm_mac(self): ) 
self.assertEqual( env_variables["runtimes_check_targets"], - "", - ) - self.assertEqual( - env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) @@ -87,21 +75,17 @@ def test_clang(self): ) self.assertEqual( env_variables["projects_to_build"], - "clang;clang-tools-extra;lld;llvm", + "clang;clang-tools-extra;compiler-rt;lld;llvm", ) self.assertEqual( env_variables["project_check_targets"], - "check-clang check-clang-tools", + "check-clang check-clang-tools check-compiler-rt", ) self.assertEqual( - env_variables["runtimes_to_build"], "compiler-rt;libcxx;libcxxabi;libunwind" + env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind" ) self.assertEqual( env_variables["runtimes_check_targets"], - "check-compiler-rt", - ) - self.assertEqual( - env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) @@ -120,10 +104,6 @@ def test_clang_windows(self): ) self.assertEqual( env_variables["runtimes_check_targets"], - "", - ) - self.assertEqual( - env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) @@ -135,7 +115,6 @@ def test_bolt(self): self.assertEqual(env_variables["project_check_targets"], "check-bolt") self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") def test_lldb(self): env_variables = compute_projects.get_env_variables( @@ -145,7 +124,6 @@ def test_lldb(self): self.assertEqual(env_variables["project_check_targets"], "check-lldb") self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") def test_mlir(self): env_variables = compute_projects.get_env_variables( @@ -157,7 +135,6 @@ def test_mlir(self): ) self.assertEqual(env_variables["runtimes_to_build"], 
"") self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") def test_flang(self): env_variables = compute_projects.get_env_variables( @@ -167,7 +144,6 @@ def test_flang(self): self.assertEqual(env_variables["project_check_targets"], "check-flang") self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") def test_invalid_subproject(self): env_variables = compute_projects.get_env_variables( @@ -177,7 +153,6 @@ def test_invalid_subproject(self): self.assertEqual(env_variables["project_check_targets"], "") self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") def test_top_level_file(self): env_variables = compute_projects.get_env_variables(["README.md"], "Linux") @@ -185,7 +160,6 @@ def test_top_level_file(self): self.assertEqual(env_variables["project_check_targets"], "") self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") def test_exclude_runtiems_in_projects(self): env_variables = compute_projects.get_env_variables( @@ -195,7 +169,6 @@ def test_exclude_runtiems_in_projects(self): self.assertEqual(env_variables["project_check_targets"], "") self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") def test_exclude_docs(self): env_variables = compute_projects.get_env_variables( @@ -205,7 +178,6 @@ def test_exclude_docs(self): self.assertEqual(env_variables["project_check_targets"], "") 
self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") def test_exclude_gn(self): env_variables = compute_projects.get_env_variables( @@ -215,7 +187,6 @@ def test_exclude_gn(self): self.assertEqual(env_variables["project_check_targets"], "") self.assertEqual(env_variables["runtimes_to_build"], "") self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") def test_ci(self): env_variables = compute_projects.get_env_variables( @@ -227,15 +198,10 @@ def test_ci(self): "check-clang check-lld check-lldb check-llvm", ) self.assertEqual( - env_variables["runtimes_to_build"], - "libcxx;libcxxabi;libunwind", + env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind" ) self.assertEqual( env_variables["runtimes_check_targets"], - "", - ) - self.assertEqual( - env_variables["runtimes_check_targets_needs_reconfig"], "check-cxx check-cxxabi check-unwind", ) @@ -249,19 +215,6 @@ def test_lldb(self): env_variables["runtimes_to_build"], "libcxx;libcxxabi;libunwind" ) self.assertEqual(env_variables["runtimes_check_targets"], "") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") - - def test_clang_tools_extra(self): - env_variables = compute_projects.get_env_variables( - ["clang-tools-extra/CMakeLists.txt"], "Linux" - ) - self.assertEqual( - env_variables["projects_to_build"], "clang;clang-tools-extra;lld;llvm" - ) - self.assertEqual(env_variables["project_check_targets"], "check-clang-tools") - self.assertEqual(env_variables["runtimes_to_build"], "libc") - self.assertEqual(env_variables["runtimes_check_targets"], "check-libc") - self.assertEqual(env_variables["runtimes_check_targets_needs_reconfig"], "") if __name__ == "__main__": diff --git a/.ci/monolithic-linux.sh b/.ci/monolithic-linux.sh index 
c350a58679140..7503ea4e6a992 100755 --- a/.ci/monolithic-linux.sh +++ b/.ci/monolithic-linux.sh @@ -57,7 +57,6 @@ projects="${1}" targets="${2}" runtimes="${3}" runtime_targets="${4}" -runtime_targets_needs_reconfig="${5}" lit_args="-v --xunit-xml-output ${BUILD_DIR}/test-results.xml --use-unique-output-file-name --timeout=1200 --time-tests" @@ -94,15 +93,9 @@ echo "--- ninja" # Targets are not escaped as they are passed as separate arguments. ninja -C "${BUILD_DIR}" -k 0 ${targets} -if [[ "${runtime_targets}" != "" ]]; then - echo "--- ninja runtimes" - - ninja -C "${BUILD_DIR}" ${runtime_targets} -fi - # Compiling runtimes with just-built Clang and running their tests # as an additional testing for Clang. -if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then +if [[ "${runtimes_targets}" != "" ]]; then echo "--- cmake runtimes C++26" cmake \ @@ -112,7 +105,7 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then echo "--- ninja runtimes C++26" - ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} + ninja -C "${BUILD_DIR}" ${runtime_targets} echo "--- cmake runtimes clang modules" @@ -123,5 +116,5 @@ if [[ "${runtime_targets_needs_reconfig}" != "" ]]; then echo "--- ninja runtimes clang modules" - ninja -C "${BUILD_DIR}" ${runtime_targets_needs_reconfig} + ninja -C "${BUILD_DIR}" ${runtime_targets} fi diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 4435a3e905768..709b6d03d94c3 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -56,12 +56,11 @@ jobs: echo "Running project checks targets: ${project_check_targets}" echo "Building runtimes: ${runtimes_to_build}" echo "Running runtimes checks targets: ${runtimes_check_targets}" - echo "Running runtimes checks requiring reconfiguring targets: ${runtimes_check_targets_needs_reconfig}" export CC=/opt/llvm/bin/clang export CXX=/opt/llvm/bin/clang++ - ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" 
"${runtimes_to_build}" "${runtimes_check_targets}" "${runtimes_check_targets_needs_reconfig}" + ./.ci/monolithic-linux.sh "${projects_to_build}" "${project_check_targets}" "${runtimes_to_build}" "${runtimes_check_targets}" - name: Upload Artifacts uses: actions/upload-artifact@65c4c4a1ddee5b72f698fdd19549f0f0fb45cf08 # v4.6.0 with: From 3cef099ceddccefca8e11268624397cde9e04af6 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Wed, 11 Jun 2025 01:06:13 +0000 Subject: [PATCH 020/851] [TySan][CMake] Depend on tysan for check-tysan in runtimes build (#143597) The runtimes build expects libclang_rt.tysan.a to be available, but the check-tysan target does not actually depend on it when built using a runtimes build with LLVM_ENABLE_RUNTIMES pointing at ./llvm. This means we get test failures when running check-compiler-rt due to the missing static archive. This patch also makes check-tysan depend on tysan when we are using the runtimes build. This is causing premerge failures currently since we recently migrated to the runtimes build. --- compiler-rt/test/tysan/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/compiler-rt/test/tysan/CMakeLists.txt b/compiler-rt/test/tysan/CMakeLists.txt index 76f57501e854e..ce0afa8769f03 100644 --- a/compiler-rt/test/tysan/CMakeLists.txt +++ b/compiler-rt/test/tysan/CMakeLists.txt @@ -21,9 +21,7 @@ foreach(arch ${TYSAN_TEST_ARCH}) endforeach() set(TYSAN_TEST_DEPS ${SANITIZER_COMMON_LIT_TEST_DEPS}) -if(NOT COMPILER_RT_STANDALONE_BUILD) - list(APPEND TYSAN_TEST_DEPS tysan) -endif() +list(APPEND TYSAN_TEST_DEPS tysan) add_lit_testsuite(check-tysan "Running the TypeSanitizer tests" ${TYSAN_TESTSUITES} From 67ff66e67734c0b283ec676899e5b89b67fdafcb Mon Sep 17 00:00:00 2001 From: Ethan Luis McDonough Date: Tue, 10 Jun 2025 20:19:38 -0500 Subject: [PATCH 021/851] [PGO][Offload] Fix offload coverage mapping (#143490) This pull request fixes coverage mapping on GPU targets. 
- It adds an address space cast to the coverage mapping generation pass. - It reads the profiled function names from the ELF directly. Reading it from public globals was causing issues in cases where multiple device-code object files are linked together. --- clang/lib/CodeGen/CoverageMappingGen.cpp | 5 +-- .../Instrumentation/InstrProfiling.cpp | 6 ---- .../common/include/GlobalHandler.h | 4 +-- .../common/src/GlobalHandler.cpp | 31 +++++++++---------- .../common/src/PluginInterface.cpp | 7 ++--- 5 files changed, 22 insertions(+), 31 deletions(-) diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 1788bb4f28697..4aafac349e3e9 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -2622,8 +2622,9 @@ void CoverageMappingModuleGen::emit() { CGM.addUsedGlobal(CovData); // Create the deferred function records array if (!FunctionNames.empty()) { - auto NamesArrTy = llvm::ArrayType::get(llvm::PointerType::getUnqual(Ctx), - FunctionNames.size()); + auto AddrSpace = FunctionNames.front()->getType()->getPointerAddressSpace(); + auto NamesArrTy = llvm::ArrayType::get( + llvm::PointerType::get(Ctx, AddrSpace), FunctionNames.size()); auto NamesArrVal = llvm::ConstantArray::get(NamesArrTy, FunctionNames); // This variable will *NOT* be emitted to the object file. It is used // to pass the list of names referenced to codegen. 
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index fe3b0da33a009..5e7548b0a2fd1 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -1955,12 +1955,6 @@ void InstrLowerer::emitNameData() { GlobalValue::PrivateLinkage, NamesVal, getInstrProfNamesVarName()); - // Make names variable public if current target is a GPU - if (isGPUProfTarget(M)) { - NamesVar->setLinkage(GlobalValue::ExternalLinkage); - NamesVar->setVisibility(GlobalValue::VisibilityTypes::ProtectedVisibility); - } - NamesSize = CompressedNameStr.size(); setGlobalVariableLargeSection(TT, *NamesVar); NamesVar->setSection( diff --git a/offload/plugins-nextgen/common/include/GlobalHandler.h b/offload/plugins-nextgen/common/include/GlobalHandler.h index 6def53430a7c0..5d6109df49da5 100644 --- a/offload/plugins-nextgen/common/include/GlobalHandler.h +++ b/offload/plugins-nextgen/common/include/GlobalHandler.h @@ -80,6 +80,7 @@ struct GPUProfGlobals { void dump() const; Error write() const; + bool empty() const; }; /// Subclass of GlobalTy that holds the memory for a global of \p Ty. @@ -192,9 +193,6 @@ class GenericGlobalHandlerTy { /*D2H=*/false); } - /// Checks whether a given image contains profiling globals. - bool hasProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image); - /// Reads profiling data from a GPU image to supplied profdata struct. /// Iterates through the image symbol table and stores global values /// with profiling prefixes. 
diff --git a/offload/plugins-nextgen/common/src/GlobalHandler.cpp b/offload/plugins-nextgen/common/src/GlobalHandler.cpp index 27d7e8ee2fdf3..5464c197dba78 100644 --- a/offload/plugins-nextgen/common/src/GlobalHandler.cpp +++ b/offload/plugins-nextgen/common/src/GlobalHandler.cpp @@ -173,16 +173,6 @@ Error GenericGlobalHandlerTy::readGlobalFromImage(GenericDeviceTy &Device, return Plugin::success(); } -bool GenericGlobalHandlerTy::hasProfilingGlobals(GenericDeviceTy &Device, - DeviceImageTy &Image) { - GlobalTy global(getInstrProfNamesVarName().str(), 0); - if (auto Err = getGlobalMetadataFromImage(Device, Image, global)) { - consumeError(std::move(Err)); - return false; - } - return true; -} - Expected GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, DeviceImageTy &Image) { @@ -204,12 +194,17 @@ GenericGlobalHandlerTy::readProfilingGlobals(GenericDeviceTy &Device, // Check if given current global is a profiling global based // on name if (*NameOrErr == getInstrProfNamesVarName()) { - // Read in profiled function names - DeviceProfileData.NamesData = SmallVector(Sym.getSize(), 0); - GlobalTy NamesGlobal(NameOrErr->str(), Sym.getSize(), - DeviceProfileData.NamesData.data()); - if (auto Err = readGlobalFromDevice(Device, Image, NamesGlobal)) - return Err; + // Read in profiled function names from ELF + auto SectionOrErr = Sym.getSection(); + if (!SectionOrErr) + return SectionOrErr.takeError(); + + auto ContentsOrErr = (*SectionOrErr)->getContents(); + if (!ContentsOrErr) + return ContentsOrErr.takeError(); + + SmallVector NameBytes(ContentsOrErr->bytes()); + DeviceProfileData.NamesData = NameBytes; } else if (NameOrErr->starts_with(getInstrProfCountersVarPrefix())) { // Read global variable profiling counts SmallVector Counts(Sym.getSize() / sizeof(int64_t), 0); @@ -322,3 +317,7 @@ Error GPUProfGlobals::write() const { return Plugin::success(); } + +bool GPUProfGlobals::empty() const { + return Counts.empty() && Data.empty() && 
NamesData.empty(); +} diff --git a/offload/plugins-nextgen/common/src/PluginInterface.cpp b/offload/plugins-nextgen/common/src/PluginInterface.cpp index f9e316adad8f4..f9a6b3c1f4324 100644 --- a/offload/plugins-nextgen/common/src/PluginInterface.cpp +++ b/offload/plugins-nextgen/common/src/PluginInterface.cpp @@ -858,14 +858,13 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) { for (auto *Image : LoadedImages) { GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler(); - if (!Handler.hasProfilingGlobals(*this, *Image)) - continue; - - GPUProfGlobals profdata; auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image); if (!ProfOrErr) return ProfOrErr.takeError(); + if (ProfOrErr->empty()) + continue; + // Dump out profdata if ((OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::PGODump)) == uint32_t(DeviceDebugKind::PGODump)) From 841a7f0897272f6412bc2e42a7dd695bf1e8a8cf Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Tue, 10 Jun 2025 18:30:07 -0700 Subject: [PATCH 022/851] [RISCV][NFC] Factor out VLEN in the SiFive7 scheduling model (#143629) In preparation for reusing SiFive7Model for sifive-x390, which has a VLEN of 1024, it's better (and less chaotic) to factor out the VLEN parameter from various places first: the plan is to do a major overhaul on this file in which all the `WriteRes` are going to be encapsulated in a big `multiclass`, where VLEN is one of its template arguments, so that we can instantiate different scheduling models with different VLEN. Before that happens, a placeholder defvar `SiFive7VLEN` is used instead in this patch. NFC.
Co-authored-by: Michael Maitland --- llvm/lib/Target/RISCV/RISCVSchedSiFive7.td | 64 ++++++++++------------ 1 file changed, 30 insertions(+), 34 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td index af64a871a9292..c1d7cd4a716e7 100644 --- a/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td +++ b/llvm/lib/Target/RISCV/RISCVSchedSiFive7.td @@ -88,9 +88,8 @@ class SiFive7GetCyclesSegmentedSeg2 { // Cycles for segmented loads and stores are calculated using the // formula vl * ceil((SEW * nf) / DLEN), where SEW * nf is the segment size. -class SiFive7GetCyclesSegmented { - defvar VLEN = 512; - defvar DLEN = 256; +class SiFive7GetCyclesSegmented { + defvar DLEN = !div(VLEN, 2); // (VLEN * LMUL) / SEW defvar VLUpperBound = !cond( !eq(mx, "M1") : !div(VLEN, sew), @@ -107,23 +106,20 @@ class SiFive7GetCyclesSegmented { int c = !mul(VLUpperBound, !div(!sub(!add(a, b), 1), b)); } -class SiFive7GetCyclesOnePerElement { - // FIXME: On SiFive7, VLEN is 512. Although a user can request the compiler - // to use a different VLEN, this model will not make scheduling decisions - // based on the user specified VLEN. +class SiFive7GetCyclesOnePerElement { // c = ceil(VLEN / SEW) * LMUL // Note: c >= 1 since the smallest VLEN is 512 / 8 = 8, and the // largest division performed on VLEN is in MF8 case with division // by 8. Therefore, there is no need to ceil the result. 
- int VLEN = !div(512, sew); + int numElements = !div(VLEN, sew); int c = !cond( - !eq(mx, "M1") : VLEN, - !eq(mx, "M2") : !mul(VLEN, 2), - !eq(mx, "M4") : !mul(VLEN, 4), - !eq(mx, "M8") : !mul(VLEN, 8), - !eq(mx, "MF2") : !div(VLEN, 2), - !eq(mx, "MF4") : !div(VLEN, 4), - !eq(mx, "MF8") : !div(VLEN, 8) + !eq(mx, "M1") : numElements, + !eq(mx, "M2") : !mul(numElements, 2), + !eq(mx, "M4") : !mul(numElements, 4), + !eq(mx, "M8") : !mul(numElements, 8), + !eq(mx, "MF2") : !div(numElements, 2), + !eq(mx, "MF4") : !div(numElements, 4), + !eq(mx, "MF8") : !div(numElements, 8) ); } @@ -139,10 +135,9 @@ class SiFive7GetDivOrSqrtFactor { /// Cycles for reductions take approximately VL*SEW/DLEN + 5(4 + log(DLEN/SEW)) /// cycles. -class SiFive7GetReductionCycles { +class SiFive7GetReductionCycles { // VLUpperBound*SEW/DLEN is equivalent to 2*LMUL since // VLUpperBound=(VLEN*LMUL)/SEW. - defvar VLEN = 512; defvar DLEN = !div(VLEN, 2); defvar TwoTimesLMUL = !cond( !eq(mx, "M1") : 2, @@ -160,8 +155,7 @@ class SiFive7GetReductionCycles { } /// Cycles for ordered reductions take approximately 6*VL cycles -class SiFive7GetOrderedReductionCycles { - defvar VLEN = 512; +class SiFive7GetOrderedReductionCycles { // (VLEN * LMUL) / SEW defvar VLUpperBound = !cond( !eq(mx, "M1") : !div(VLEN, sew), @@ -234,6 +228,8 @@ def SiFive7VCQ : ProcResource<1>; // Vector Command Queue def SiFive7PipeAB : ProcResGroup<[SiFive7PipeA, SiFive7PipeB]>; +defvar SiFive7VLEN = 512; + // Branching let Latency = 3 in { def : WriteRes; @@ -481,7 +477,7 @@ foreach mx = SchedMxList in { foreach mx = SchedMxList in { defvar VLDSX0Cycles = SiFive7GetCyclesDefault.c; - defvar Cycles = SiFive7GetCyclesOnePerElement.c; + defvar Cycles = SiFive7GetCyclesOnePerElement.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS8", VLDSX0Pred, [SiFive7VCQ, SiFive7VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), @@ -501,7 +497,7 @@ foreach mx = SchedMxList in { // since 
LMUL >= 16/64. foreach mx = ["MF4", "MF2", "M1", "M2", "M4", "M8"] in { defvar VLDSX0Cycles = SiFive7GetCyclesDefault.c; - defvar Cycles = SiFive7GetCyclesOnePerElement.c; + defvar Cycles = SiFive7GetCyclesOnePerElement.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS16", VLDSX0Pred, [SiFive7VCQ, SiFive7VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), @@ -518,7 +514,7 @@ foreach mx = ["MF4", "MF2", "M1", "M2", "M4", "M8"] in { } foreach mx = ["MF2", "M1", "M2", "M4", "M8"] in { defvar VLDSX0Cycles = SiFive7GetCyclesDefault.c; - defvar Cycles = SiFive7GetCyclesOnePerElement.c; + defvar Cycles = SiFive7GetCyclesOnePerElement.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS32", VLDSX0Pred, [SiFive7VCQ, SiFive7VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), @@ -535,7 +531,7 @@ foreach mx = ["MF2", "M1", "M2", "M4", "M8"] in { } foreach mx = ["M1", "M2", "M4", "M8"] in { defvar VLDSX0Cycles = SiFive7GetCyclesDefault.c; - defvar Cycles = SiFive7GetCyclesOnePerElement.c; + defvar Cycles = SiFive7GetCyclesOnePerElement.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; defm SiFive7 : LMULWriteResMXVariant<"WriteVLDS64", VLDSX0Pred, [SiFive7VCQ, SiFive7VL], 4, [0, 1], [1, !add(1, VLDSX0Cycles)], !add(3, Cycles), @@ -588,7 +584,7 @@ foreach mx = SchedMxList in { let Latency = 1, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in defm "" : LMULWriteResMX<"WriteVSSEG2e" # eew, [SiFive7VCQ, SiFive7VS], mx, IsWorstCase>; foreach nf=3-8 in { - defvar Cycles = SiFive7GetCyclesSegmented.c; + defvar Cycles = SiFive7GetCyclesSegmented.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; // Does not chain so set latency high let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { @@ -603,7 +599,7 @@ foreach mx = SchedMxList in { foreach mx = SchedMxList in { foreach nf=2-8 in { foreach eew = [8, 16, 32, 64] in { - 
defvar Cycles = SiFive7GetCyclesSegmented.c; + defvar Cycles = SiFive7GetCyclesSegmented.c; defvar IsWorstCase = SiFive7IsWorstCaseMX.c; // Does not chain so set latency high let Latency = !add(3, Cycles), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { @@ -669,7 +665,7 @@ foreach mx = SchedMxList in { foreach mx = SchedMxList in { foreach sew = SchedSEWSet.val in { defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor.c, - !div(SiFive7GetCyclesOnePerElement.c, 4)); + !div(SiFive7GetCyclesOnePerElement.c, 4)); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW.c; let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm "" : LMULSEWWriteResMXSEW<"WriteVIDivV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>; @@ -774,7 +770,7 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListF in { foreach sew = SchedSEWSet.val in { defvar Cycles = !mul(SiFive7GetDivOrSqrtFactor.c, - !div(SiFive7GetCyclesOnePerElement.c, 4)); + !div(SiFive7GetCyclesOnePerElement.c, 4)); defvar IsWorstCase = SiFive7IsWorstCaseMXSEW.c; let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm "" : LMULSEWWriteResMXSEW<"WriteVFSqrtV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>; @@ -834,7 +830,7 @@ foreach mx = SchedMxListFW in { // 14. 
Vector Reduction Operations foreach mx = SchedMxList in { foreach sew = SchedSEWSet.val in { - defvar Cycles = SiFive7GetReductionCycles.c; + defvar Cycles = SiFive7GetReductionCycles.c; defvar IsWorstCase = SiFive7IsWorstCaseMXSEW.c; let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm "" : LMULSEWWriteResMXSEW<"WriteVIRedV_From", [SiFive7VCQ, SiFive7VA], @@ -847,7 +843,7 @@ foreach mx = SchedMxList in { foreach mx = SchedMxListWRed in { foreach sew = SchedSEWSet.val in { - defvar Cycles = SiFive7GetReductionCycles.c; + defvar Cycles = SiFive7GetReductionCycles.c; defvar IsWorstCase = SiFive7IsWorstCaseMXSEW.c; let Latency = Cycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in defm "" : LMULSEWWriteResMXSEW<"WriteVIWRedV_From", [SiFive7VCQ, SiFive7VA], @@ -857,7 +853,7 @@ foreach mx = SchedMxListWRed in { foreach mx = SchedMxListF in { foreach sew = SchedSEWSet.val in { - defvar RedCycles = SiFive7GetReductionCycles.c; + defvar RedCycles = SiFive7GetReductionCycles.c; defvar IsWorstCase = SiFive7IsWorstCaseMXSEW.c; let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in { defm "" : LMULSEWWriteResMXSEW<"WriteVFRedV_From", [SiFive7VCQ, SiFive7VA], @@ -865,7 +861,7 @@ foreach mx = SchedMxListF in { defm "" : LMULSEWWriteResMXSEW<"WriteVFRedMinMaxV_From", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>; } - defvar OrdRedCycles = SiFive7GetOrderedReductionCycles.c; + defvar OrdRedCycles = SiFive7GetOrderedReductionCycles.c; let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in defm "" : LMULSEWWriteResMXSEW<"WriteVFRedOV_From", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>; @@ -874,12 +870,12 @@ foreach mx = SchedMxListF in { foreach mx = SchedMxListFWRed in { foreach sew = SchedSEWSet.val in { - defvar RedCycles = SiFive7GetReductionCycles.c; + defvar RedCycles = SiFive7GetReductionCycles.c; defvar IsWorstCase = 
SiFive7IsWorstCaseMXSEW.c; let Latency = RedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, RedCycles)] in defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedV_From", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>; - defvar OrdRedCycles = SiFive7GetOrderedReductionCycles.c; + defvar OrdRedCycles = SiFive7GetOrderedReductionCycles.c; let Latency = OrdRedCycles, AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, OrdRedCycles)] in defm "" : LMULSEWWriteResMXSEW<"WriteVFWRedOV_From", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>; @@ -924,7 +920,7 @@ foreach mx = SchedMxList in { foreach mx = SchedMxList in { foreach sew = SchedSEWSet.val in { - defvar Cycles = SiFive7GetCyclesOnePerElement.c; + defvar Cycles = SiFive7GetCyclesOnePerElement.c; defvar IsWorstCase = SiFive7IsWorstCaseMXSEW.c; let Latency = !add(Cycles, 3), AcquireAtCycles = [0, 1], ReleaseAtCycles = [1, !add(1, Cycles)] in { defm "" : LMULSEWWriteResMXSEW<"WriteVRGatherVV", [SiFive7VCQ, SiFive7VA], mx, sew, IsWorstCase>; From 8c890eaa3f4cedb494dc2a8180d9c9219bf76900 Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Wed, 11 Jun 2025 10:19:12 +0800 Subject: [PATCH 023/851] Revert "[SelectionDAG] Make `(a & x) | (~a & y) -> (a & (x ^ y)) ^ y` available for all targets" (#143648) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 57 -- .../Target/SystemZ/SystemZISelLowering.cpp | 14 - llvm/lib/Target/SystemZ/SystemZISelLowering.h | 1 - llvm/lib/Target/X86/X86ISelLowering.cpp | 58 ++ llvm/test/CodeGen/AMDGPU/bfi_int.ll | 30 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 42 +- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 161 ++--- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 42 +- ...unfold-masked-merge-scalar-variablemask.ll | 42 +- ...unfold-masked-merge-vector-variablemask.ll | 167 ++--- llvm/test/CodeGen/RISCV/fold-masked-merge.ll | 302 --------- ...unfold-masked-merge-scalar-variablemask.ll | 62 +- .../test/CodeGen/SystemZ/fold-masked-merge.ll | 277 -------- 
llvm/test/CodeGen/WebAssembly/simd-arith.ll | 600 +++++++++++------- llvm/test/CodeGen/X86/bitselect.ll | 50 +- llvm/test/CodeGen/X86/fold-masked-merge.ll | 30 +- ...unfold-masked-merge-scalar-variablemask.ll | 26 +- ...unfold-masked-merge-vector-variablemask.ll | 598 ++++++++--------- 18 files changed, 1059 insertions(+), 1500 deletions(-) delete mode 100644 llvm/test/CodeGen/RISCV/fold-masked-merge.ll delete mode 100644 llvm/test/CodeGen/SystemZ/fold-masked-merge.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b0da536a3b157..b65e8e06eae62 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8128,59 +8128,6 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1, return SDValue(); } -static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1, - SDValue AndR1, const SDLoc &DL, - SelectionDAG &DAG) { - if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse()) - return SDValue(); - SDValue NotOp = AndL0->getOperand(0); - if (NotOp == AndR1) - std::swap(AndR1, AndL1); - if (NotOp != AndL1) - return SDValue(); - - EVT VT = AndL1->getValueType(0); - SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, AndR0); - SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp); - SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, AndR0); - return Xor1; -} - -/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the -/// equivalent `((x ^ y) & m) ^ y)` pattern. -/// This is typically a better representation for targets without a fused -/// "and-not" operation. -static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG, - const TargetLowering &TLI, const SDLoc &DL) { - // Note that masked-merge variants using XOR or ADD expressions are - // normalized to OR by InstCombine so we only check for OR. 
- assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); - SDValue N0 = Node->getOperand(0); - if (N0->getOpcode() != ISD::AND || !N0->hasOneUse()) - return SDValue(); - SDValue N1 = Node->getOperand(1); - if (N1->getOpcode() != ISD::AND || !N1->hasOneUse()) - return SDValue(); - - // If the target supports and-not, don't fold this. - if (TLI.hasAndNot(SDValue(Node, 0))) - return SDValue(); - - SDValue N00 = N0->getOperand(0); - SDValue N01 = N0->getOperand(1); - SDValue N10 = N1->getOperand(0); - SDValue N11 = N1->getOperand(1); - if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG)) - return Result; - if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG)) - return Result; - if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG)) - return Result; - if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG)) - return Result; - return SDValue(); -} - SDValue DAGCombiner::visitOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8359,10 +8306,6 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG)) return R; - if (VT.isScalarInteger() && VT != MVT::i1) - if (SDValue R = foldMaskedMerge(N, DAG, TLI, DL)) - return R; - return SDValue(); } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index 1c59b1e63b7bc..f06246706aaa9 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1283,20 +1283,6 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses( return true; } -bool SystemZTargetLowering::hasAndNot(SDValue Y) const { - EVT VT = Y.getValueType(); - - // We can use NC(G)RK for types in GPRs ... - if (VT == MVT::i32 || VT == MVT::i64) - return Subtarget.hasMiscellaneousExtensions3(); - - // ... or VNC for types in VRs. 
- if (VT.isVector() || VT == MVT::i128) - return Subtarget.hasVector(); - - return false; -} - // Information about the addressing mode for a memory access. struct AddressingMode { // True if a long displacement is supported. diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index f2f0bf6d8b410..f3536a840fda8 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -671,7 +671,6 @@ class SystemZTargetLowering : public TargetLowering { } unsigned getStackProbeSize(const MachineFunction &MF) const; - bool hasAndNot(SDValue Y) const override; private: const SystemZSubtarget &Subtarget; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 68da901c2f123..96be91256915d 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52350,6 +52350,59 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); } +static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, + SDValue And1_L, SDValue And1_R, + const SDLoc &DL, SelectionDAG &DAG) { + if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse()) + return SDValue(); + SDValue NotOp = And0_L->getOperand(0); + if (NotOp == And1_R) + std::swap(And1_R, And1_L); + if (NotOp != And1_L) + return SDValue(); + + // (~(NotOp) & And0_R) | (NotOp & And1_R) + // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R + EVT VT = And1_L->getValueType(0); + SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R); + SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp); + SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R); + return Xor1; +} + +/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the +/// equivalent `((x ^ y) & m) ^ y)` 
pattern. +/// This is typically a better representation for targets without a fused +/// "and-not" operation. This function is intended to be called from a +/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes. +static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) { + // Note that masked-merge variants using XOR or ADD expressions are + // normalized to OR by InstCombine so we only check for OR. + assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); + SDValue N0 = Node->getOperand(0); + if (N0->getOpcode() != ISD::AND || !N0->hasOneUse()) + return SDValue(); + SDValue N1 = Node->getOperand(1); + if (N1->getOpcode() != ISD::AND || !N1->hasOneUse()) + return SDValue(); + + SDLoc DL(Node); + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + SDValue N10 = N1->getOperand(0); + SDValue N11 = N1->getOperand(1); + if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG)) + return Result; + return SDValue(); +} + /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. @@ -52753,6 +52806,11 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, } } + // We should fold "masked merge" patterns when `andn` is not available. 
+ if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1) + if (SDValue R = foldMaskedMerge(N, DAG)) + return R; + if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG)) return R; diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index b372dec383344..201b97d479c68 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -16,9 +16,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b32 s1, s1, s2 +; GFX7-NEXT: s_andn2_b32 s2, s2, s0 ; GFX7-NEXT: s_and_b32 s0, s1, s0 -; GFX7-NEXT: s_xor_b32 s0, s0, s2 +; GFX7-NEXT: s_or_b32 s0, s2, s0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -28,9 +28,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b32 s1, s1, s2 +; GFX8-NEXT: s_andn2_b32 s2, s2, s0 ; GFX8-NEXT: s_and_b32 s0, s1, s0 -; GFX8-NEXT: s_xor_b32 s0, s0, s2 +; GFX8-NEXT: s_or_b32 s0, s2, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -44,9 +44,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b32 s1, s1, s2 +; GFX10-NEXT: s_andn2_b32 s2, s2, s0 ; GFX10-NEXT: s_and_b32 s0, s1, s0 -; GFX10-NEXT: s_xor_b32 s0, s0, s2 +; GFX10-NEXT: s_or_b32 s0, s2, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm @@ -1407,9 +1407,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, 
i64 %mask) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX7-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX7-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; GFX7-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] +; GFX7-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -1422,9 +1422,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX8-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; GFX8-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] +; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1438,9 +1438,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] -; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] -; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] +; GFX10-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] +; GFX10-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] +; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index e1b4cad370f96..6925a98f643b9 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -289,16 +289,16 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] ; GCN-NEXT: s_lshl_b32 s6, s6, 4 ; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -317,10 +317,10 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 -; GCN-NEXT: s_xor_b32 s4, s2, 0x3c003c00 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GCN-NEXT: s_and_b32 s3, s4, s3 -; GCN-NEXT: s_xor_b32 s2, s3, s2 +; GCN-NEXT: s_andn2_b32 s2, s2, s3 +; GCN-NEXT: s_and_b32 s3, s3, 0x3c003c00 +; GCN-NEXT: s_or_b32 s2, s3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -399,10 +399,10 @@ define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 -; GCN-NEXT: s_xor_b32 s4, s2, 0x10001 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GCN-NEXT: s_and_b32 s3, s4, s3 -; GCN-NEXT: s_xor_b32 s2, s3, s2 +; GCN-NEXT: s_andn2_b32 s2, s2, s3 +; GCN-NEXT: s_and_b32 s3, s3, 0x10001 +; GCN-NEXT: s_or_b32 s2, s3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; 
GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -417,16 +417,16 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] ; GCN-NEXT: s_lshl_b32 s6, s6, 4 ; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] +; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -442,15 +442,15 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s5, s3, 0x1010101 -; GCN-NEXT: s_lshl_b32 s6, s6, 3 -; GCN-NEXT: s_xor_b32 s4, s2, 0x1010101 -; GCN-NEXT: s_lshl_b64 s[6:7], 0xff, s6 -; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: s_lshl_b32 s4, s6, 3 +; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 +; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 +; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 +; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] +; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index 
44bd4090436ef..be16fac4c53f7 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1511,13 +1511,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_lshl_b32 s1, s3, 4 ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_xor_b32 s0, s2, 0x50005 -; SI-NEXT: s_lshl_b32 s1, 0xffff, s1 -; SI-NEXT: s_and_b32 s0, s0, s1 -; SI-NEXT: s_xor_b32 s0, s0, s2 +; SI-NEXT: s_lshl_b32 s0, s3, 4 +; SI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_andn2_b32 s1, s2, s0 +; SI-NEXT: s_and_b32 s0, s0, 0x50005 +; SI-NEXT: s_or_b32 s0, s0, s1 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1528,13 +1528,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshl_b32 s1, s3, 4 ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_xor_b32 s0, s2, 0x50005 -; VI-NEXT: s_lshl_b32 s1, 0xffff, s1 -; VI-NEXT: s_and_b32 s0, s0, s1 -; VI-NEXT: s_xor_b32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s0, s3, 4 +; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; VI-NEXT: s_mov_b32 s5, s1 +; VI-NEXT: s_andn2_b32 s1, s2, s0 +; VI-NEXT: s_and_b32 s0, s0, 0x50005 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1552,13 +1552,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_lshl_b32 s8, s8, 4 +; SI-NEXT: s_lshl_b32 s0, s8, 4 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_xor_b32 s1, s3, 0x50005 -; SI-NEXT: s_xor_b32 s0, s2, 0x50005 -; SI-NEXT: 
s_lshl_b64 s[8:9], 0xffff, s8 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 +; SI-NEXT: s_and_b32 s9, s1, 0x50005 +; SI-NEXT: s_and_b32 s8, s0, 0x50005 +; SI-NEXT: s_andn2_b64 s[0:1], s[2:3], s[0:1] +; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -1573,14 +1573,14 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_mov_b32 s0, 0x50005 +; VI-NEXT: s_lshl_b32 s0, s8, 4 +; VI-NEXT: s_mov_b32 s8, 0x50005 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_mov_b32 s1, s0 -; VI-NEXT: s_lshl_b32 s8, s8, 4 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; VI-NEXT: s_lshl_b64 s[8:9], 0xffff, s8 +; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 +; VI-NEXT: s_mov_b32 s9, s8 +; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] ; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1594,34 +1594,35 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[8:9], 0xa -; SI-NEXT: s_load_dword s5, s[8:9], 0x13 +; SI-NEXT: s_load_dword s4, s[8:9], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s5, s[8:9], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s6, s4, 0x505 -; SI-NEXT: s_lshl_b32 s5, s5, 3 -; SI-NEXT: s_lshl_b32 s5, 0xff, s5 
-; SI-NEXT: s_and_b32 s5, s6, s5 -; SI-NEXT: s_xor_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s4, s4, 3 +; SI-NEXT: s_lshl_b32 s4, 0xff, s4 +; SI-NEXT: s_andn2_b32 s5, s5, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x505 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[8:9], 0x28 -; VI-NEXT: s_load_dword s5, s[8:9], 0x4c +; VI-NEXT: s_load_dword s4, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s5, s[8:9], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s6, s4, 0x505 -; VI-NEXT: s_lshl_b32 s5, s5, 3 -; VI-NEXT: s_lshl_b32 s5, 0xff, s5 -; VI-NEXT: s_and_b32 s5, s6, s5 -; VI-NEXT: s_xor_b32 s4, s5, s4 +; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, 0xff, s4 +; VI-NEXT: s_and_b32 s6, s4, 0x505 +; VI-NEXT: s_xor_b32 s4, s4, 0xffff +; VI-NEXT: s_and_b32 s4, s4, s5 +; VI-NEXT: s_or_b32 s4, s6, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1635,17 +1636,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[8:9], 0xa -; SI-NEXT: s_load_dword s5, s[8:9], 0x13 +; SI-NEXT: s_load_dword s4, s[8:9], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s5, s[8:9], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s6, s4, 0x5050505 -; SI-NEXT: s_lshl_b32 s5, s5, 3 -; SI-NEXT: s_lshl_b32 s5, 0xff, s5 -; SI-NEXT: s_and_b32 s5, s6, s5 -; SI-NEXT: s_xor_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s4, s4, 3 +; 
SI-NEXT: s_lshl_b32 s4, 0xff, s4 +; SI-NEXT: s_andn2_b32 s5, s5, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x5050505 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1655,17 +1656,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v3i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[8:9], 0x28 -; VI-NEXT: s_load_dword s5, s[8:9], 0x4c +; VI-NEXT: s_load_dword s4, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s5, s[8:9], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s6, s4, 0x5050505 -; VI-NEXT: s_lshl_b32 s5, s5, 3 -; VI-NEXT: s_lshl_b32 s5, 0xff, s5 -; VI-NEXT: s_and_b32 s5, s6, s5 -; VI-NEXT: s_xor_b32 s4, s5, s4 +; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, 0xff, s4 +; VI-NEXT: s_andn2_b32 s5, s5, s4 +; VI-NEXT: s_and_b32 s4, s4, 0x5050505 +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1680,34 +1681,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[8:9], 0xa -; SI-NEXT: s_load_dword s5, s[8:9], 0x13 +; SI-NEXT: s_load_dword s4, s[8:9], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; SI-NEXT: s_load_dword s5, s[8:9], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s6, s4, 0x5050505 -; SI-NEXT: s_lshl_b32 s5, s5, 3 -; SI-NEXT: s_lshl_b32 s5, 0xff, s5 -; SI-NEXT: s_and_b32 s5, s6, s5 -; SI-NEXT: s_xor_b32 s4, s5, s4 +; SI-NEXT: s_lshl_b32 s4, s4, 3 +; 
SI-NEXT: s_lshl_b32 s4, 0xff, s4 +; SI-NEXT: s_andn2_b32 s5, s5, s4 +; SI-NEXT: s_and_b32 s4, s4, 0x5050505 +; SI-NEXT: s_or_b32 s4, s4, s5 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[8:9], 0x28 -; VI-NEXT: s_load_dword s5, s[8:9], 0x4c +; VI-NEXT: s_load_dword s4, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; VI-NEXT: s_load_dword s5, s[8:9], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s6, s4, 0x5050505 -; VI-NEXT: s_lshl_b32 s5, s5, 3 -; VI-NEXT: s_lshl_b32 s5, 0xff, s5 -; VI-NEXT: s_and_b32 s5, s6, s5 -; VI-NEXT: s_xor_b32 s4, s5, s4 +; VI-NEXT: s_lshl_b32 s4, s4, 3 +; VI-NEXT: s_lshl_b32 s4, 0xff, s4 +; VI-NEXT: s_andn2_b32 s5, s5, s4 +; VI-NEXT: s_and_b32 s4, s4, 0x5050505 +; VI-NEXT: s_or_b32 s4, s4, s5 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1720,20 +1721,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p ; SI-LABEL: s_dynamic_insertelement_v8i8: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; SI-NEXT: s_load_dword s8, s[8:9], 0x4 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: s_load_dword s8, s[8:9], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_lshl_b32 s0, s8, 3 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_lshl_b32 s8, s8, 3 +; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 +; SI-NEXT: s_and_b32 s9, s1, 0x5050505 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_xor_b32 s1, s3, 0x5050505 -; SI-NEXT: s_xor_b32 s0, s2, 0x5050505 -; SI-NEXT: s_lshl_b64 s[8:9], 0xff, s8 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; SI-NEXT: 
s_and_b32 s8, s0, 0x5050505 +; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1742,20 +1743,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p ; VI-LABEL: s_dynamic_insertelement_v8i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 +; VI-NEXT: s_load_dword s8, s[8:9], 0x10 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 -; VI-NEXT: s_load_dword s8, s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_lshl_b32 s0, s8, 3 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshl_b32 s8, s8, 3 +; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 +; VI-NEXT: s_and_b32 s9, s1, 0x5050505 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_xor_b32 s1, s3, 0x5050505 -; VI-NEXT: s_xor_b32 s0, s2, 0x5050505 -; VI-NEXT: s_lshl_b64 s[8:9], 0xff, s8 -; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; VI-NEXT: s_and_b32 s8, s0, 0x5050505 +; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index a0ad6328b0c01..e0dacb7a59a42 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1534,11 +1534,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s3, s6, 4 -; GFX9-NEXT: s_xor_b32 s2, s7, 0x3e703e7 -; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GFX9-NEXT: s_and_b32 s2, s2, s3 -; GFX9-NEXT: s_xor_b32 s2, 
s2, s7 +; GFX9-NEXT: s_lshl_b32 s2, s6, 4 +; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 +; GFX9-NEXT: s_andn2_b32 s3, s7, s2 +; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 +; GFX9-NEXT: s_or_b32 s2, s2, s3 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -1553,14 +1553,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s1, s4, 4 -; VI-NEXT: s_xor_b32 s0, s2, 0x3e703e7 -; VI-NEXT: s_lshl_b32 s1, 0xffff, s1 -; VI-NEXT: s_and_b32 s0, s0, s1 -; VI-NEXT: s_xor_b32 s0, s0, s2 +; VI-NEXT: s_lshl_b32 s0, s4, 4 +; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; VI-NEXT: s_andn2_b32 s1, s2, s0 +; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7 +; VI-NEXT: s_or_b32 s0, s0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1575,14 +1575,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_mov_b32_e32 v0, s0 +; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s1, s4, 4 -; CI-NEXT: s_xor_b32 s0, s2, 0x3e703e7 -; CI-NEXT: s_lshl_b32 s1, 0xffff, s1 -; CI-NEXT: s_and_b32 s0, s0, s1 -; CI-NEXT: s_xor_b32 s0, s0, s2 +; CI-NEXT: s_lshl_b32 s0, s4, 4 +; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 +; CI-NEXT: s_andn2_b32 s1, s2, s0 +; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7 +; CI-NEXT: s_or_b32 s0, s0, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -1597,12 +1597,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: 
s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s3, s4, 4 -; GFX11-NEXT: s_xor_b32 s4, s2, 0x3e703e7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s3 +; GFX11-NEXT: s_and_not1_b32 s2, s2, s3 +; GFX11-NEXT: s_and_b32 s3, s3, 0x3e703e7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s3, s4, s3 -; GFX11-NEXT: s_xor_b32 s2, s3, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_or_b32 s2, s3, s2 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll index 321b64510c35f..69724aa75af4f 100644 --- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll @@ -5,11 +5,10 @@ define i32 @s_out32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) { ; GCN-LABEL: s_out32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %mx = and i32 %x, %mask @@ -23,11 +22,10 @@ define i64 @s_out64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) { ; GCN-LABEL: s_out64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) 
| instskip(NEXT) | instid1(SALU_CYCLE_1) ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17] -; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[16:17] +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GCN-NEXT: s_setpc_b64 s[30:31] %mx = and i64 %x, %mask @@ -429,11 +427,10 @@ define i32 @s_out_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask) ; GCN-LABEL: s_out_constant_varx_42: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s0, 42 +; GCN-NEXT: s_and_b32 s0, s2, s0 +; GCN-NEXT: s_and_not1_b32 s1, 42, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 @@ -465,11 +462,10 @@ define i32 @s_out_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg ; GCN-LABEL: s_out_constant_varx_42_invmask: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s1, s0, 42 +; GCN-NEXT: s_and_not1_b32 s0, s0, s2 +; GCN-NEXT: s_and_b32 s1, s2, 42 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s1, s1, s2 -; GCN-NEXT: s_xor_b32 s0, s1, s0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 @@ -564,11 +560,10 @@ define i32 @s_out_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) ; GCN-LABEL: s_out_constant_42_vary: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: 
s_xor_b32 s0, s1, 42 +; GCN-NEXT: s_and_b32 s0, s2, 42 +; GCN-NEXT: s_and_not1_b32 s1, s1, s2 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, s1 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 @@ -600,11 +595,10 @@ define i32 @s_out_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg ; GCN-LABEL: s_out_constant_42_vary_invmask: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s0, s1, 42 +; GCN-NEXT: s_and_not1_b32 s0, 42, s2 +; GCN-NEXT: s_and_b32 s1, s2, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_xor_b32 s0, s0, 42 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GCN-NEXT: s_or_b32 s0, s0, s1 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll index bac8bbbf0b4de..8e4c77e76029c 100644 --- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll @@ -8,16 +8,17 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b16 %rs<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b8 %rs1, [out_v1i8_param_0]; -; CHECK-NEXT: ld.param.b8 %rs2, [out_v1i8_param_1]; -; CHECK-NEXT: ld.param.b8 %rs3, [out_v1i8_param_2]; -; CHECK-NEXT: xor.b16 %rs4, %rs1, %rs2; -; CHECK-NEXT: and.b16 %rs5, %rs4, %rs3; -; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs6; +; CHECK-NEXT: 
ld.param.b8 %rs2, [out_v1i8_param_2]; +; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: ld.param.b8 %rs4, [out_v1i8_param_1]; +; CHECK-NEXT: not.b16 %rs5, %rs2; +; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5; +; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs7; ; CHECK-NEXT: ret; %mx = and <1 x i8> %x, %mask %notmask = xor <1 x i8> %mask, @@ -33,16 +34,17 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<7>; +; CHECK-NEXT: .reg .b16 %rs<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [out_v1i16_param_0]; -; CHECK-NEXT: ld.param.b16 %rs2, [out_v1i16_param_1]; -; CHECK-NEXT: ld.param.b16 %rs3, [out_v1i16_param_2]; -; CHECK-NEXT: xor.b16 %rs4, %rs1, %rs2; -; CHECK-NEXT: and.b16 %rs5, %rs4, %rs3; -; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; +; CHECK-NEXT: ld.param.b16 %rs2, [out_v1i16_param_2]; +; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; +; CHECK-NEXT: ld.param.b16 %rs4, [out_v1i16_param_1]; +; CHECK-NEXT: not.b16 %rs5, %rs2; +; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5; +; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs7; ; CHECK-NEXT: ret; %mx = and <1 x i16> %x, %mask %notmask = xor <1 x i16> %mask, @@ -124,16 +126,17 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: out_v1i32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<7>; +; CHECK-NEXT: .reg .b32 %r<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [out_v1i32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [out_v1i32_param_1]; -; CHECK-NEXT: ld.param.b32 %r3, [out_v1i32_param_2]; -; CHECK-NEXT: xor.b32 %r4, %r1, %r2; -; CHECK-NEXT: and.b32 %r5, %r4, %r3; 
-; CHECK-NEXT: xor.b32 %r6, %r5, %r2; -; CHECK-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-NEXT: ld.param.b32 %r2, [out_v1i32_param_2]; +; CHECK-NEXT: and.b32 %r3, %r1, %r2; +; CHECK-NEXT: ld.param.b32 %r4, [out_v1i32_param_1]; +; CHECK-NEXT: not.b32 %r5, %r2; +; CHECK-NEXT: and.b32 %r6, %r4, %r5; +; CHECK-NEXT: or.b32 %r7, %r3, %r6; +; CHECK-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-NEXT: ret; %mx = and <1 x i32> %x, %mask %notmask = xor <1 x i32> %mask, @@ -227,19 +230,21 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: out_v2i32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b32 %r<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [out_v2i32_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_1]; -; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [out_v2i32_param_2]; -; CHECK-NEXT: xor.b32 %r7, %r2, %r4; -; CHECK-NEXT: and.b32 %r8, %r7, %r6; -; CHECK-NEXT: xor.b32 %r9, %r8, %r4; -; CHECK-NEXT: xor.b32 %r10, %r1, %r3; -; CHECK-NEXT: and.b32 %r11, %r10, %r5; -; CHECK-NEXT: xor.b32 %r12, %r11, %r3; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r9}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_2]; +; CHECK-NEXT: and.b32 %r5, %r1, %r3; +; CHECK-NEXT: and.b32 %r6, %r2, %r4; +; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [out_v2i32_param_1]; +; CHECK-NEXT: not.b32 %r9, %r4; +; CHECK-NEXT: not.b32 %r10, %r3; +; CHECK-NEXT: and.b32 %r11, %r7, %r10; +; CHECK-NEXT: and.b32 %r12, %r8, %r9; +; CHECK-NEXT: or.b32 %r13, %r6, %r12; +; CHECK-NEXT: or.b32 %r14, %r5, %r11; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r13}; ; CHECK-NEXT: ret; %mx = and <2 x i32> %x, %mask %notmask = xor <2 x i32> %mask, @@ -251,16 +256,17 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin define <1 x i64> @out_v1i64(<1 x i64> %x, 
<1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: out_v1i64( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-NEXT: .reg .b64 %rd<8>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [out_v1i64_param_0]; -; CHECK-NEXT: ld.param.b64 %rd2, [out_v1i64_param_1]; -; CHECK-NEXT: ld.param.b64 %rd3, [out_v1i64_param_2]; -; CHECK-NEXT: xor.b64 %rd4, %rd1, %rd2; -; CHECK-NEXT: and.b64 %rd5, %rd4, %rd3; -; CHECK-NEXT: xor.b64 %rd6, %rd5, %rd2; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; +; CHECK-NEXT: ld.param.b64 %rd2, [out_v1i64_param_2]; +; CHECK-NEXT: and.b64 %rd3, %rd1, %rd2; +; CHECK-NEXT: ld.param.b64 %rd4, [out_v1i64_param_1]; +; CHECK-NEXT: not.b64 %rd5, %rd2; +; CHECK-NEXT: and.b64 %rd6, %rd4, %rd5; +; CHECK-NEXT: or.b64 %rd7, %rd3, %rd6; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd7; ; CHECK-NEXT: ret; %mx = and <1 x i64> %x, %mask %notmask = xor <1 x i64> %mask, @@ -344,25 +350,29 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<25>; +; CHECK-NEXT: .reg .b32 %r<29>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_1]; -; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [out_v4i32_param_2]; -; CHECK-NEXT: xor.b32 %r13, %r4, %r8; -; CHECK-NEXT: and.b32 %r14, %r13, %r12; -; CHECK-NEXT: xor.b32 %r15, %r14, %r8; -; CHECK-NEXT: xor.b32 %r16, %r3, %r7; -; CHECK-NEXT: and.b32 %r17, %r16, %r11; -; CHECK-NEXT: xor.b32 %r18, %r17, %r7; -; CHECK-NEXT: xor.b32 %r19, %r2, %r6; -; CHECK-NEXT: and.b32 %r20, %r19, %r10; -; CHECK-NEXT: xor.b32 %r21, %r20, %r6; -; CHECK-NEXT: xor.b32 %r22, %r1, %r5; -; CHECK-NEXT: and.b32 %r23, %r22, %r9; -; CHECK-NEXT: xor.b32 %r24, %r23, %r5; -; CHECK-NEXT: st.param.v4.b32 [func_retval0], 
{%r24, %r21, %r18, %r15}; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2]; +; CHECK-NEXT: and.b32 %r9, %r1, %r5; +; CHECK-NEXT: and.b32 %r10, %r2, %r6; +; CHECK-NEXT: and.b32 %r11, %r3, %r7; +; CHECK-NEXT: and.b32 %r12, %r4, %r8; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1]; +; CHECK-NEXT: not.b32 %r17, %r8; +; CHECK-NEXT: not.b32 %r18, %r7; +; CHECK-NEXT: not.b32 %r19, %r6; +; CHECK-NEXT: not.b32 %r20, %r5; +; CHECK-NEXT: and.b32 %r21, %r13, %r20; +; CHECK-NEXT: and.b32 %r22, %r14, %r19; +; CHECK-NEXT: and.b32 %r23, %r15, %r18; +; CHECK-NEXT: and.b32 %r24, %r16, %r17; +; CHECK-NEXT: or.b32 %r25, %r12, %r24; +; CHECK-NEXT: or.b32 %r26, %r11, %r23; +; CHECK-NEXT: or.b32 %r27, %r10, %r22; +; CHECK-NEXT: or.b32 %r28, %r9, %r21; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25}; ; CHECK-NEXT: ret; %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -374,23 +384,26 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32_undef( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<23>; +; CHECK-NEXT: .reg .b32 %r<26>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0]; ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2]; ; CHECK-NEXT: and.b32 %r9, %r3, %r7; -; CHECK-NEXT: ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [out_v4i32_undef_param_1]; -; CHECK-NEXT: xor.b32 %r14, %r4, %r13; -; CHECK-NEXT: and.b32 %r15, %r14, %r8; -; CHECK-NEXT: xor.b32 %r16, %r15, %r13; -; CHECK-NEXT: xor.b32 %r17, %r2, %r11; -; CHECK-NEXT: and.b32 %r18, %r17, %r6; -; CHECK-NEXT: xor.b32 %r19, %r18, %r11; -; CHECK-NEXT: xor.b32 %r20, %r1, %r10; -; CHECK-NEXT: and.b32 %r21, %r20, %r5; -; CHECK-NEXT: xor.b32 %r22, %r21, %r10; -; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r19, %r9, 
%r16}; +; CHECK-NEXT: and.b32 %r10, %r1, %r5; +; CHECK-NEXT: and.b32 %r11, %r2, %r6; +; CHECK-NEXT: and.b32 %r12, %r4, %r8; +; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1]; +; CHECK-NEXT: not.b32 %r17, %r8; +; CHECK-NEXT: not.b32 %r18, %r6; +; CHECK-NEXT: not.b32 %r19, %r5; +; CHECK-NEXT: and.b32 %r20, %r13, %r19; +; CHECK-NEXT: and.b32 %r21, %r14, %r18; +; CHECK-NEXT: and.b32 %r22, %r16, %r17; +; CHECK-NEXT: or.b32 %r23, %r12, %r22; +; CHECK-NEXT: or.b32 %r24, %r11, %r21; +; CHECK-NEXT: or.b32 %r25, %r10, %r20; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r25, %r24, %r9, %r23}; ; CHECK-NEXT: ret; %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -402,19 +415,21 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: out_v2i64( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<13>; +; CHECK-NEXT: .reg .b64 %rd<15>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [out_v2i64_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_1]; -; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [out_v2i64_param_2]; -; CHECK-NEXT: xor.b64 %rd7, %rd2, %rd4; -; CHECK-NEXT: and.b64 %rd8, %rd7, %rd6; -; CHECK-NEXT: xor.b64 %rd9, %rd8, %rd4; -; CHECK-NEXT: xor.b64 %rd10, %rd1, %rd3; -; CHECK-NEXT: and.b64 %rd11, %rd10, %rd5; -; CHECK-NEXT: xor.b64 %rd12, %rd11, %rd3; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd12, %rd9}; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_2]; +; CHECK-NEXT: and.b64 %rd5, %rd1, %rd3; +; CHECK-NEXT: and.b64 %rd6, %rd2, %rd4; +; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [out_v2i64_param_1]; +; CHECK-NEXT: not.b64 %rd9, %rd4; +; CHECK-NEXT: not.b64 %rd10, %rd3; +; CHECK-NEXT: and.b64 %rd11, %rd7, %rd10; +; CHECK-NEXT: and.b64 %rd12, %rd8, %rd9; +; CHECK-NEXT: or.b64 %rd13, %rd6, %rd12; +; CHECK-NEXT: or.b64 %rd14, %rd5, 
%rd11; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd14, %rd13}; ; CHECK-NEXT: ret; %mx = and <2 x i64> %x, %mask %notmask = xor <2 x i64> %mask, diff --git a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll deleted file mode 100644 index 631b7109281e5..0000000000000 --- a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll +++ /dev/null @@ -1,302 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV32,RV32I -; RUN: llc -mtriple=riscv64 < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV64,RV64I -; RUN: llc -mtriple=riscv32 -mattr=+zbb < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV32,RV32ZBB -; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB -; -; test that masked-merge code is generated as "xor;and;xor" sequence or -; "andn ; and; or" if and-not is available. 
- -define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) { -; CHECK-I-LABEL: masked_merge0: -; CHECK-I: # %bb.0: -; CHECK-I-NEXT: xor a1, a1, a2 -; CHECK-I-NEXT: and a0, a1, a0 -; CHECK-I-NEXT: xor a0, a0, a2 -; CHECK-I-NEXT: ret -; -; CHECK-ZBB-LABEL: masked_merge0: -; CHECK-ZBB: # %bb.0: -; CHECK-ZBB-NEXT: and a1, a0, a1 -; CHECK-ZBB-NEXT: andn a0, a2, a0 -; CHECK-ZBB-NEXT: or a0, a1, a0 -; CHECK-ZBB-NEXT: ret - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - ret i32 %or -} - -define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { -; CHECK-I-LABEL: masked_merge1: -; CHECK-I: # %bb.0: -; CHECK-I-NEXT: xor a1, a1, a2 -; CHECK-I-NEXT: and a0, a1, a0 -; CHECK-I-NEXT: xor a0, a0, a2 -; CHECK-I-NEXT: ret -; -; CHECK-ZBB-LABEL: masked_merge1: -; CHECK-ZBB: # %bb.0: -; CHECK-ZBB-NEXT: and a1, a0, a1 -; CHECK-ZBB-NEXT: andn a0, a2, a0 -; CHECK-ZBB-NEXT: or a0, a1, a0 -; CHECK-ZBB-NEXT: ret - %and0 = and i16 %a0, %a1 - %not = xor i16 %a0, -1 - %and1 = and i16 %a2, %not - %or = or i16 %and0, %and1 - ret i16 %or -} - -define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) { -; CHECK-I-LABEL: masked_merge2: -; CHECK-I: # %bb.0: -; CHECK-I-NEXT: mv a0, a1 -; CHECK-I-NEXT: ret -; -; CHECK-ZBB-LABEL: masked_merge2: -; CHECK-ZBB: # %bb.0: -; CHECK-ZBB-NEXT: andn a2, a1, a0 -; CHECK-ZBB-NEXT: and a0, a1, a0 -; CHECK-ZBB-NEXT: or a0, a2, a0 -; CHECK-ZBB-NEXT: ret - %not = xor i8 %a0, -1 - %and0 = and i8 %not, %a1 - %and1 = and i8 %a1, %a0 - %or = or i8 %and0, %and1 - ret i8 %or -} - -define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) { -; RV32I-LABEL: masked_merge3: -; RV32I: # %bb.0: -; RV32I-NEXT: not a5, a5 -; RV32I-NEXT: not a4, a4 -; RV32I-NEXT: xor a3, a3, a5 -; RV32I-NEXT: xor a2, a2, a4 -; RV32I-NEXT: not a2, a2 -; RV32I-NEXT: not a3, a3 -; RV32I-NEXT: and a0, a2, a0 -; RV32I-NEXT: and a1, a3, a1 -; RV32I-NEXT: xor a0, a0, a4 -; RV32I-NEXT: xor a1, a1, a5 -; RV32I-NEXT: ret -; -; RV64I-LABEL: masked_merge3: -; RV64I: # 
%bb.0: -; RV64I-NEXT: not a2, a2 -; RV64I-NEXT: xor a1, a1, a2 -; RV64I-NEXT: not a1, a1 -; RV64I-NEXT: and a0, a1, a0 -; RV64I-NEXT: xor a0, a0, a2 -; RV64I-NEXT: ret -; -; RV32ZBB-LABEL: masked_merge3: -; RV32ZBB: # %bb.0: -; RV32ZBB-NEXT: not a6, a0 -; RV32ZBB-NEXT: not a7, a1 -; RV32ZBB-NEXT: andn a1, a1, a3 -; RV32ZBB-NEXT: andn a0, a0, a2 -; RV32ZBB-NEXT: andn a2, a7, a5 -; RV32ZBB-NEXT: andn a3, a6, a4 -; RV32ZBB-NEXT: or a0, a3, a0 -; RV32ZBB-NEXT: or a1, a2, a1 -; RV32ZBB-NEXT: ret -; -; RV64ZBB-LABEL: masked_merge3: -; RV64ZBB: # %bb.0: -; RV64ZBB-NEXT: not a3, a0 -; RV64ZBB-NEXT: andn a2, a3, a2 -; RV64ZBB-NEXT: andn a0, a0, a1 -; RV64ZBB-NEXT: or a0, a2, a0 -; RV64ZBB-NEXT: ret - %v0 = xor i64 %a1, -1 - %v1 = xor i64 %a2, -1 - %not = xor i64 %a0, -1 - %and0 = and i64 %not, %v1 - %and1 = and i64 %v0, %a0 - %or = or i64 %and0, %and1 - ret i64 %or -} - -define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) { -; RV32-LABEL: not_a_masked_merge0: -; RV32: # %bb.0: -; RV32-NEXT: and a1, a0, a1 -; RV32-NEXT: neg a0, a0 -; RV32-NEXT: and a0, a0, a2 -; RV32-NEXT: or a0, a1, a0 -; RV32-NEXT: ret -; -; RV64-LABEL: not_a_masked_merge0: -; RV64: # %bb.0: -; RV64-NEXT: and a1, a0, a1 -; RV64-NEXT: negw a0, a0 -; RV64-NEXT: and a0, a0, a2 -; RV64-NEXT: or a0, a1, a0 -; RV64-NEXT: ret - %and0 = and i32 %a0, %a1 - %not_a_not = sub i32 0, %a0 - %and1 = and i32 %not_a_not, %a2 - %or = or i32 %and0, %and1 - ret i32 %or -} - -define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { -; CHECK-I-LABEL: not_a_masked_merge1: -; CHECK-I: # %bb.0: -; CHECK-I-NEXT: and a0, a0, a1 -; CHECK-I-NEXT: not a1, a3 -; CHECK-I-NEXT: and a1, a1, a2 -; CHECK-I-NEXT: or a0, a0, a1 -; CHECK-I-NEXT: ret -; -; CHECK-ZBB-LABEL: not_a_masked_merge1: -; CHECK-ZBB: # %bb.0: -; CHECK-ZBB-NEXT: and a0, a0, a1 -; CHECK-ZBB-NEXT: andn a1, a2, a3 -; CHECK-ZBB-NEXT: or a0, a0, a1 -; CHECK-ZBB-NEXT: ret - %and0 = and i32 %a0, %a1 - %not = xor i32 %a3, -1 - %and1 = and i32 %not, %a2 - %or = 
or i32 %and0, %and1 - ret i32 %or -} - -define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) { -; CHECK-I-LABEL: not_a_masked_merge2: -; CHECK-I: # %bb.0: -; CHECK-I-NEXT: or a1, a0, a1 -; CHECK-I-NEXT: not a0, a0 -; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: or a0, a1, a0 -; CHECK-I-NEXT: ret -; -; CHECK-ZBB-LABEL: not_a_masked_merge2: -; CHECK-ZBB: # %bb.0: -; CHECK-ZBB-NEXT: or a1, a0, a1 -; CHECK-ZBB-NEXT: andn a0, a2, a0 -; CHECK-ZBB-NEXT: or a0, a1, a0 -; CHECK-ZBB-NEXT: ret - %not_an_and0 = or i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %not_an_and0, %and1 - ret i32 %or -} - -define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) { -; CHECK-I-LABEL: not_a_masked_merge3: -; CHECK-I: # %bb.0: -; CHECK-I-NEXT: and a1, a0, a1 -; CHECK-I-NEXT: xor a0, a0, a2 -; CHECK-I-NEXT: not a0, a0 -; CHECK-I-NEXT: or a0, a1, a0 -; CHECK-I-NEXT: ret -; -; CHECK-ZBB-LABEL: not_a_masked_merge3: -; CHECK-ZBB: # %bb.0: -; CHECK-ZBB-NEXT: and a1, a0, a1 -; CHECK-ZBB-NEXT: xor a0, a0, a2 -; CHECK-ZBB-NEXT: orn a0, a1, a0 -; CHECK-ZBB-NEXT: ret - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %not_an_and1 = xor i32 %not, %a2 - %or = or i32 %and0, %not_an_and1 - ret i32 %or -} - -define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) { -; CHECK-LABEL: not_a_masked_merge4: -; CHECK: # %bb.0: -; CHECK-NEXT: and a0, a0, a1 -; CHECK-NEXT: ret - %and0 = and i32 %a0, %a1 - %not = xor i32 %a2, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - ret i32 %or -} - -define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { -; CHECK-I-LABEL: masked_merge_no_transform0: -; CHECK-I: # %bb.0: -; CHECK-I-NEXT: and a1, a0, a1 -; CHECK-I-NEXT: not a0, a0 -; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: or a0, a1, a0 -; CHECK-I-NEXT: sw a1, 0(a3) -; CHECK-I-NEXT: ret -; -; CHECK-ZBB-LABEL: masked_merge_no_transform0: -; CHECK-ZBB: # %bb.0: -; CHECK-ZBB-NEXT: and a1, a0, a1 -; CHECK-ZBB-NEXT: andn a0, a2, a0 -; 
CHECK-ZBB-NEXT: or a0, a1, a0 -; CHECK-ZBB-NEXT: sw a1, 0(a3) -; CHECK-ZBB-NEXT: ret - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - store i32 %and0, ptr %p1 - ret i32 %or -} - -define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { -; CHECK-I-LABEL: masked_merge_no_transform1: -; CHECK-I: # %bb.0: -; CHECK-I-NEXT: and a1, a0, a1 -; CHECK-I-NEXT: not a4, a0 -; CHECK-I-NEXT: and a0, a4, a2 -; CHECK-I-NEXT: or a0, a1, a0 -; CHECK-I-NEXT: sw a4, 0(a3) -; CHECK-I-NEXT: ret -; -; CHECK-ZBB-LABEL: masked_merge_no_transform1: -; CHECK-ZBB: # %bb.0: -; CHECK-ZBB-NEXT: and a1, a0, a1 -; CHECK-ZBB-NEXT: not a4, a0 -; CHECK-ZBB-NEXT: andn a0, a2, a0 -; CHECK-ZBB-NEXT: or a0, a1, a0 -; CHECK-ZBB-NEXT: sw a4, 0(a3) -; CHECK-ZBB-NEXT: ret - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - store i32 %not, ptr %p1 - ret i32 %or -} - -define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { -; CHECK-I-LABEL: masked_merge_no_transform2: -; CHECK-I: # %bb.0: -; CHECK-I-NEXT: and a1, a0, a1 -; CHECK-I-NEXT: not a0, a0 -; CHECK-I-NEXT: and a2, a0, a2 -; CHECK-I-NEXT: or a0, a1, a2 -; CHECK-I-NEXT: sw a2, 0(a3) -; CHECK-I-NEXT: ret -; -; CHECK-ZBB-LABEL: masked_merge_no_transform2: -; CHECK-ZBB: # %bb.0: -; CHECK-ZBB-NEXT: and a1, a0, a1 -; CHECK-ZBB-NEXT: andn a2, a2, a0 -; CHECK-ZBB-NEXT: or a0, a1, a2 -; CHECK-ZBB-NEXT: sw a2, 0(a3) -; CHECK-ZBB-NEXT: ret - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - store i32 %and1, ptr %p1 - ret i32 %or -} diff --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll index efc8243df71e0..1517e524a7f78 100644 --- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll +++ 
b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll @@ -8,13 +8,16 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB +; TODO: Should we convert these to X ^ ((X ^ Y) & M) form when Zbb isn't +; present? define i8 @out8(i8 %x, i8 %y, i8 %mask) { ; CHECK-I-LABEL: out8: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: xor a0, a0, a1 +; CHECK-I-NEXT: not a2, a2 +; CHECK-I-NEXT: and a1, a1, a2 +; CHECK-I-NEXT: or a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out8: @@ -33,9 +36,10 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) { define i16 @out16(i16 %x, i16 %y, i16 %mask) { ; CHECK-I-LABEL: out16: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: xor a0, a0, a1 +; CHECK-I-NEXT: not a2, a2 +; CHECK-I-NEXT: and a1, a1, a2 +; CHECK-I-NEXT: or a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out16: @@ -54,9 +58,10 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) { define i32 @out32(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out32: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: xor a0, a0, a1 +; CHECK-I-NEXT: not a2, a2 +; CHECK-I-NEXT: and a1, a1, a2 +; CHECK-I-NEXT: or a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out32: @@ -75,19 +80,22 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) { define i64 @out64(i64 %x, i64 %y, i64 %mask) { ; RV32I-LABEL: out64: ; RV32I: # %bb.0: -; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: xor a1, a1, a3 -; RV32I-NEXT: and a0, a0, a4 ; RV32I-NEXT: and a1, a1, a5 -; RV32I-NEXT: xor a0, a0, a2 -; RV32I-NEXT: xor a1, a1, a3 +; RV32I-NEXT: and a0, a0, a4 +; RV32I-NEXT: not a4, a4 +; RV32I-NEXT: not a5, a5 +; RV32I-NEXT: and a3, a3, a5 +; RV32I-NEXT: and a2, a2, a4 +; RV32I-NEXT: or a0, a0, a2 +; RV32I-NEXT: or a1, a1, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: out64: ; RV64I: # %bb.0: -; RV64I-NEXT: xor a0, 
a0, a1 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: xor a0, a0, a1 +; RV64I-NEXT: not a2, a2 +; RV64I-NEXT: and a1, a1, a2 +; RV64I-NEXT: or a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: out64: @@ -652,9 +660,10 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) { define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out_constant_varx_42: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: xori a0, a0, 42 -; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: xori a0, a0, 42 +; CHECK-I-NEXT: not a1, a2 +; CHECK-I-NEXT: and a0, a2, a0 +; CHECK-I-NEXT: andi a1, a1, 42 +; CHECK-I-NEXT: or a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out_constant_varx_42: @@ -695,9 +704,10 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) { define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out_constant_varx_42_invmask: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: xori a1, a0, 42 -; CHECK-I-NEXT: and a1, a1, a2 -; CHECK-I-NEXT: xor a0, a1, a0 +; CHECK-I-NEXT: not a1, a2 +; CHECK-I-NEXT: and a0, a1, a0 +; CHECK-I-NEXT: andi a1, a2, 42 +; CHECK-I-NEXT: or a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out_constant_varx_42_invmask: @@ -802,9 +812,10 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) { define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out_constant_42_vary: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: xori a0, a1, 42 -; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: xor a0, a0, a1 +; CHECK-I-NEXT: not a0, a2 +; CHECK-I-NEXT: andi a2, a2, 42 +; CHECK-I-NEXT: and a0, a0, a1 +; CHECK-I-NEXT: or a0, a2, a0 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out_constant_42_vary: @@ -844,9 +855,10 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) { define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out_constant_42_vary_invmask: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: xori a0, a1, 42 -; CHECK-I-NEXT: and a0, a0, a2 -; 
CHECK-I-NEXT: xori a0, a0, 42 +; CHECK-I-NEXT: not a0, a2 +; CHECK-I-NEXT: andi a0, a0, 42 +; CHECK-I-NEXT: and a1, a2, a1 +; CHECK-I-NEXT: or a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out_constant_42_vary_invmask: diff --git a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll deleted file mode 100644 index c014345507f69..0000000000000 --- a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll +++ /dev/null @@ -1,277 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s --check-prefix=NO-MISC3 -; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s --check-prefix=MISC3 - -; test that masked-merge code is generated as "xor;and;xor" sequence or -; "andn ; and; or" if and-not is available. - -define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) { -; NO-MISC3-LABEL: masked_merge0: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: xr %r3, %r4 -; NO-MISC3-NEXT: nr %r2, %r3 -; NO-MISC3-NEXT: xr %r2, %r4 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: masked_merge0: -; MISC3: # %bb.0: -; MISC3-NEXT: nr %r3, %r2 -; MISC3-NEXT: ncrk %r2, %r4, %r2 -; MISC3-NEXT: or %r2, %r3 -; MISC3-NEXT: br %r14 - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - ret i32 %or -} - -define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { -; NO-MISC3-LABEL: masked_merge1: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: xr %r3, %r4 -; NO-MISC3-NEXT: nr %r2, %r3 -; NO-MISC3-NEXT: xr %r2, %r4 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: masked_merge1: -; MISC3: # %bb.0: -; MISC3-NEXT: ncrk %r0, %r4, %r2 -; MISC3-NEXT: nr %r2, %r3 -; MISC3-NEXT: or %r2, %r0 -; MISC3-NEXT: br %r14 - %and0 = and i16 %a0, %a1 - %not = xor i16 %a0, -1 - %and1 = and i16 %a2, %not - %or = or i16 %and0, %and1 - ret i16 %or -} - -define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) { -; NO-MISC3-LABEL: 
masked_merge2: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: lr %r2, %r3 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: masked_merge2: -; MISC3: # %bb.0: -; MISC3-NEXT: lr %r2, %r3 -; MISC3-NEXT: br %r14 - %not = xor i8 %a0, -1 - %and0 = and i8 %not, %a1 - %and1 = and i8 %a1, %a0 - %or = or i8 %and0, %and1 - ret i8 %or -} - -define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) { -; NO-MISC3-LABEL: masked_merge3: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: lcgr %r0, %r4 -; NO-MISC3-NEXT: aghi %r0, -1 -; NO-MISC3-NEXT: xgr %r3, %r0 -; NO-MISC3-NEXT: ngr %r3, %r2 -; NO-MISC3-NEXT: xgr %r3, %r2 -; NO-MISC3-NEXT: xgrk %r2, %r3, %r0 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: masked_merge3: -; MISC3: # %bb.0: -; MISC3-NEXT: lcgr %r0, %r2 -; MISC3-NEXT: aghi %r0, -1 -; MISC3-NEXT: ncgrk %r0, %r0, %r4 -; MISC3-NEXT: ncgrk %r2, %r2, %r3 -; MISC3-NEXT: ogr %r2, %r0 -; MISC3-NEXT: br %r14 - %v0 = xor i64 %a1, -1 - %v1 = xor i64 %a2, -1 - %not = xor i64 %a0, -1 - %and0 = and i64 %not, %v1 - %and1 = and i64 %v0, %a0 - %or = or i64 %and0, %and1 - ret i64 %or -} - -define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) { -; NO-MISC3-LABEL: not_a_masked_merge0: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: lcr %r0, %r2 -; NO-MISC3-NEXT: nr %r3, %r2 -; NO-MISC3-NEXT: nr %r0, %r4 -; NO-MISC3-NEXT: ork %r2, %r3, %r0 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: not_a_masked_merge0: -; MISC3: # %bb.0: -; MISC3-NEXT: lcr %r0, %r2 -; MISC3-NEXT: nr %r3, %r2 -; MISC3-NEXT: nr %r0, %r4 -; MISC3-NEXT: ork %r2, %r3, %r0 -; MISC3-NEXT: br %r14 - %and0 = and i32 %a0, %a1 - %not_a_not = sub i32 0, %a0 - %and1 = and i32 %not_a_not, %a2 - %or = or i32 %and0, %and1 - ret i32 %or -} - -define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { -; NO-MISC3-LABEL: not_a_masked_merge1: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: xilf %r5, 4294967295 -; NO-MISC3-NEXT: nr %r2, %r3 -; NO-MISC3-NEXT: nr %r4, %r5 -; NO-MISC3-NEXT: or %r2, %r4 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: not_a_masked_merge1: -; 
MISC3: # %bb.0: -; MISC3-NEXT: nr %r2, %r3 -; MISC3-NEXT: ncrk %r0, %r4, %r5 -; MISC3-NEXT: or %r2, %r0 -; MISC3-NEXT: br %r14 - %and0 = and i32 %a0, %a1 - %not = xor i32 %a3, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - ret i32 %or -} - -define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) { -; NO-MISC3-LABEL: not_a_masked_merge2: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: or %r3, %r2 -; NO-MISC3-NEXT: xilf %r2, 4294967295 -; NO-MISC3-NEXT: nr %r2, %r4 -; NO-MISC3-NEXT: or %r2, %r3 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: not_a_masked_merge2: -; MISC3: # %bb.0: -; MISC3-NEXT: or %r3, %r2 -; MISC3-NEXT: ncrk %r2, %r4, %r2 -; MISC3-NEXT: or %r2, %r3 -; MISC3-NEXT: br %r14 - %not_an_and0 = or i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %not_an_and0, %and1 - ret i32 %or -} - -define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) { -; NO-MISC3-LABEL: not_a_masked_merge3: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: nr %r3, %r2 -; NO-MISC3-NEXT: xr %r2, %r4 -; NO-MISC3-NEXT: xilf %r2, 4294967295 -; NO-MISC3-NEXT: or %r2, %r3 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: not_a_masked_merge3: -; MISC3: # %bb.0: -; MISC3-NEXT: nr %r3, %r2 -; MISC3-NEXT: xr %r2, %r4 -; MISC3-NEXT: ocrk %r2, %r3, %r2 -; MISC3-NEXT: br %r14 - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %not_an_and1 = xor i32 %not, %a2 - %or = or i32 %and0, %not_an_and1 - ret i32 %or -} - -define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) { -; NO-MISC3-LABEL: not_a_masked_merge4: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: nr %r2, %r3 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: not_a_masked_merge4: -; MISC3: # %bb.0: -; MISC3-NEXT: nr %r2, %r3 -; MISC3-NEXT: br %r14 - %and0 = and i32 %a0, %a1 - %not = xor i32 %a2, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - ret i32 %or -} - -define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { -; NO-MISC3-LABEL: masked_merge_no_transform0: -; NO-MISC3: # %bb.0: -; 
NO-MISC3-NEXT: nr %r3, %r2 -; NO-MISC3-NEXT: xilf %r2, 4294967295 -; NO-MISC3-NEXT: nr %r2, %r4 -; NO-MISC3-NEXT: or %r2, %r3 -; NO-MISC3-NEXT: st %r3, 0(%r5) -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: masked_merge_no_transform0: -; MISC3: # %bb.0: -; MISC3-NEXT: nr %r3, %r2 -; MISC3-NEXT: ncrk %r2, %r4, %r2 -; MISC3-NEXT: or %r2, %r3 -; MISC3-NEXT: st %r3, 0(%r5) -; MISC3-NEXT: br %r14 - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - store i32 %and0, ptr %p1 - ret i32 %or -} - -define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { -; NO-MISC3-LABEL: masked_merge_no_transform1: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: nrk %r0, %r2, %r3 -; NO-MISC3-NEXT: xilf %r2, 4294967295 -; NO-MISC3-NEXT: nr %r4, %r2 -; NO-MISC3-NEXT: or %r0, %r4 -; NO-MISC3-NEXT: st %r2, 0(%r5) -; NO-MISC3-NEXT: lr %r2, %r0 -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: masked_merge_no_transform1: -; MISC3: # %bb.0: -; MISC3-NEXT: nrk %r0, %r2, %r3 -; MISC3-NEXT: ncrk %r1, %r4, %r2 -; MISC3-NEXT: xilf %r2, 4294967295 -; MISC3-NEXT: or %r0, %r1 -; MISC3-NEXT: st %r2, 0(%r5) -; MISC3-NEXT: lr %r2, %r0 -; MISC3-NEXT: br %r14 - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 %and0, %and1 - store i32 %not, ptr %p1 - ret i32 %or -} - -define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { -; NO-MISC3-LABEL: masked_merge_no_transform2: -; NO-MISC3: # %bb.0: -; NO-MISC3-NEXT: nr %r3, %r2 -; NO-MISC3-NEXT: xilf %r2, 4294967295 -; NO-MISC3-NEXT: nr %r4, %r2 -; NO-MISC3-NEXT: ork %r2, %r3, %r4 -; NO-MISC3-NEXT: st %r4, 0(%r5) -; NO-MISC3-NEXT: br %r14 -; -; MISC3-LABEL: masked_merge_no_transform2: -; MISC3: # %bb.0: -; MISC3-NEXT: nr %r3, %r2 -; MISC3-NEXT: ncrk %r0, %r4, %r2 -; MISC3-NEXT: ork %r2, %r3, %r0 -; MISC3-NEXT: st %r0, 0(%r5) -; MISC3-NEXT: br %r14 - %and0 = and i32 %a0, %a1 - %not = xor i32 %a0, -1 - %and1 = and i32 %not, %a2 - %or = or i32 
%and0, %and1 - store i32 %and1, ptr %p1 - ret i32 %or -} diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index e3607e12bf530..185c46aa5681e 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -4465,139 +4465,203 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128-LABEL: bitselect_v16i8: ; NO-SIMD128: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.xor $push0=, $32, $48 -; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $16 -; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $48 -; NO-SIMD128-NEXT: i32.store8 15($0), $pop2 -; NO-SIMD128-NEXT: i32.xor $push3=, $31, $47 -; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $15 -; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $47 -; NO-SIMD128-NEXT: i32.store8 14($0), $pop5 -; NO-SIMD128-NEXT: i32.xor $push6=, $30, $46 -; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $14 -; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $46 -; NO-SIMD128-NEXT: i32.store8 13($0), $pop8 -; NO-SIMD128-NEXT: i32.xor $push9=, $29, $45 -; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $13 -; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $45 -; NO-SIMD128-NEXT: i32.store8 12($0), $pop11 -; NO-SIMD128-NEXT: i32.xor $push12=, $28, $44 -; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $12 -; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $44 -; NO-SIMD128-NEXT: i32.store8 11($0), $pop14 -; NO-SIMD128-NEXT: i32.xor $push15=, $27, $43 -; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $11 -; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $43 -; NO-SIMD128-NEXT: i32.store8 10($0), $pop17 -; NO-SIMD128-NEXT: i32.xor $push18=, $26, $42 -; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $10 
-; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $42 -; NO-SIMD128-NEXT: i32.store8 9($0), $pop20 -; NO-SIMD128-NEXT: i32.xor $push21=, $25, $41 -; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $9 -; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $41 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop23 -; NO-SIMD128-NEXT: i32.xor $push24=, $24, $40 -; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $8 -; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $40 -; NO-SIMD128-NEXT: i32.store8 7($0), $pop26 -; NO-SIMD128-NEXT: i32.xor $push27=, $23, $39 -; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $7 -; NO-SIMD128-NEXT: i32.xor $push29=, $pop28, $39 -; NO-SIMD128-NEXT: i32.store8 6($0), $pop29 -; NO-SIMD128-NEXT: i32.xor $push30=, $22, $38 -; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $6 -; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $38 -; NO-SIMD128-NEXT: i32.store8 5($0), $pop32 -; NO-SIMD128-NEXT: i32.xor $push33=, $21, $37 -; NO-SIMD128-NEXT: i32.and $push34=, $pop33, $5 -; NO-SIMD128-NEXT: i32.xor $push35=, $pop34, $37 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop35 -; NO-SIMD128-NEXT: i32.xor $push36=, $20, $36 -; NO-SIMD128-NEXT: i32.and $push37=, $pop36, $4 -; NO-SIMD128-NEXT: i32.xor $push38=, $pop37, $36 -; NO-SIMD128-NEXT: i32.store8 3($0), $pop38 -; NO-SIMD128-NEXT: i32.xor $push39=, $19, $35 -; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $3 -; NO-SIMD128-NEXT: i32.xor $push41=, $pop40, $35 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop41 -; NO-SIMD128-NEXT: i32.xor $push42=, $18, $34 -; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $2 -; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $34 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop44 -; NO-SIMD128-NEXT: i32.xor $push45=, $17, $33 -; NO-SIMD128-NEXT: i32.and $push46=, $pop45, $1 -; NO-SIMD128-NEXT: i32.xor $push47=, $pop46, $33 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop47 +; NO-SIMD128-NEXT: i32.and $push0=, $16, $32 +; NO-SIMD128-NEXT: i32.const $push1=, -1 +; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop1 +; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $48 
+; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop4 +; NO-SIMD128-NEXT: i32.and $push5=, $15, $31 +; NO-SIMD128-NEXT: i32.const $push79=, -1 +; NO-SIMD128-NEXT: i32.xor $push6=, $15, $pop79 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $47 +; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop8 +; NO-SIMD128-NEXT: i32.and $push9=, $14, $30 +; NO-SIMD128-NEXT: i32.const $push78=, -1 +; NO-SIMD128-NEXT: i32.xor $push10=, $14, $pop78 +; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $46 +; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop12 +; NO-SIMD128-NEXT: i32.and $push13=, $13, $29 +; NO-SIMD128-NEXT: i32.const $push77=, -1 +; NO-SIMD128-NEXT: i32.xor $push14=, $13, $pop77 +; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $45 +; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop16 +; NO-SIMD128-NEXT: i32.and $push17=, $12, $28 +; NO-SIMD128-NEXT: i32.const $push76=, -1 +; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop76 +; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $44 +; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19 +; NO-SIMD128-NEXT: i32.store8 11($0), $pop20 +; NO-SIMD128-NEXT: i32.and $push21=, $11, $27 +; NO-SIMD128-NEXT: i32.const $push75=, -1 +; NO-SIMD128-NEXT: i32.xor $push22=, $11, $pop75 +; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $43 +; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop24 +; NO-SIMD128-NEXT: i32.and $push25=, $10, $26 +; NO-SIMD128-NEXT: i32.const $push74=, -1 +; NO-SIMD128-NEXT: i32.xor $push26=, $10, $pop74 +; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $42 +; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop28 +; NO-SIMD128-NEXT: i32.and $push29=, $9, $25 +; NO-SIMD128-NEXT: i32.const $push73=, -1 +; NO-SIMD128-NEXT: i32.xor $push30=, $9, $pop73 +; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $41 
+; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop32 +; NO-SIMD128-NEXT: i32.and $push33=, $8, $24 +; NO-SIMD128-NEXT: i32.const $push72=, -1 +; NO-SIMD128-NEXT: i32.xor $push34=, $8, $pop72 +; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $40 +; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop36 +; NO-SIMD128-NEXT: i32.and $push37=, $7, $23 +; NO-SIMD128-NEXT: i32.const $push71=, -1 +; NO-SIMD128-NEXT: i32.xor $push38=, $7, $pop71 +; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $39 +; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop40 +; NO-SIMD128-NEXT: i32.and $push41=, $6, $22 +; NO-SIMD128-NEXT: i32.const $push70=, -1 +; NO-SIMD128-NEXT: i32.xor $push42=, $6, $pop70 +; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $38 +; NO-SIMD128-NEXT: i32.or $push44=, $pop41, $pop43 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop44 +; NO-SIMD128-NEXT: i32.and $push45=, $5, $21 +; NO-SIMD128-NEXT: i32.const $push69=, -1 +; NO-SIMD128-NEXT: i32.xor $push46=, $5, $pop69 +; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $37 +; NO-SIMD128-NEXT: i32.or $push48=, $pop45, $pop47 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop48 +; NO-SIMD128-NEXT: i32.and $push49=, $4, $20 +; NO-SIMD128-NEXT: i32.const $push68=, -1 +; NO-SIMD128-NEXT: i32.xor $push50=, $4, $pop68 +; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $36 +; NO-SIMD128-NEXT: i32.or $push52=, $pop49, $pop51 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop52 +; NO-SIMD128-NEXT: i32.and $push53=, $3, $19 +; NO-SIMD128-NEXT: i32.const $push67=, -1 +; NO-SIMD128-NEXT: i32.xor $push54=, $3, $pop67 +; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $35 +; NO-SIMD128-NEXT: i32.or $push56=, $pop53, $pop55 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop56 +; NO-SIMD128-NEXT: i32.and $push57=, $2, $18 +; NO-SIMD128-NEXT: i32.const $push66=, -1 +; NO-SIMD128-NEXT: i32.xor $push58=, $2, $pop66 +; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $34 +; 
NO-SIMD128-NEXT: i32.or $push60=, $pop57, $pop59 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop60 +; NO-SIMD128-NEXT: i32.and $push61=, $1, $17 +; NO-SIMD128-NEXT: i32.const $push65=, -1 +; NO-SIMD128-NEXT: i32.xor $push62=, $1, $pop65 +; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $33 +; NO-SIMD128-NEXT: i32.or $push64=, $pop61, $pop63 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop64 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v16i8: ; NO-SIMD128-FAST: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: i32.xor $push0=, $17, $33 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $pop0, $1 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $pop1, $33 -; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $34 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop3, $2 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $pop4, $34 -; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop5 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $19, $35 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3 -; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $35 -; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $36 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $36 -; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop11 -; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $21, $37 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $37 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop14 -; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $38 -; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6 -; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $38 -; NO-SIMD128-FAST-NEXT: 
i32.store8 5($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $23, $39 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7 -; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $39 -; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop20 -; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $24, $40 -; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8 -; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $40 -; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop23 -; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $25, $41 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $9 -; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $41 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26 -; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $26, $42 -; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $10 -; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $pop28, $42 -; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop29 -; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $27, $43 -; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $11 -; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $43 -; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop32 -; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $28, $44 -; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, $12 -; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $pop34, $44 -; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop35 -; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $29, $45 -; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $13 -; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $45 -; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop38 -; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $30, $46 -; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $14 -; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $pop40, $46 -; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop41 -; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $31, $47 -; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $15 -; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $47 -; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop44 -; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $32, $48 -; 
NO-SIMD128-FAST-NEXT: i32.and $push46=, $pop45, $16 -; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $pop46, $48 -; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop47 +; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $17 +; NO-SIMD128-FAST-NEXT: i32.const $push1=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop1 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $pop2, $33 +; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3 +; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $18 +; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop79 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $34 +; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7 +; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $19 +; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop78 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $35 +; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11 +; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $20 +; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop77 +; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $36 +; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $5, $21 +; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop76 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $37 +; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $6, $22 +; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop75 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $38 +; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23 +; 
NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $7, $23 +; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop74 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $39 +; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $24 +; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop73 +; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $40 +; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32 +; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $25 +; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $9, $pop72 +; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $41 +; NO-SIMD128-FAST-NEXT: i32.or $push36=, $pop33, $pop35 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $26 +; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $10, $pop71 +; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $42 +; NO-SIMD128-FAST-NEXT: i32.or $push40=, $pop37, $pop39 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40 +; NO-SIMD128-FAST-NEXT: i32.and $push41=, $11, $27 +; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $11, $pop70 +; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $43 +; NO-SIMD128-FAST-NEXT: i32.or $push44=, $pop41, $pop43 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44 +; NO-SIMD128-FAST-NEXT: i32.and $push45=, $12, $28 +; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop69 +; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $44 +; NO-SIMD128-FAST-NEXT: i32.or $push48=, $pop45, $pop47 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48 +; NO-SIMD128-FAST-NEXT: i32.and $push49=, $13, $29 +; 
NO-SIMD128-FAST-NEXT: i32.const $push68=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $13, $pop68 +; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $45 +; NO-SIMD128-FAST-NEXT: i32.or $push52=, $pop49, $pop51 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52 +; NO-SIMD128-FAST-NEXT: i32.and $push53=, $14, $30 +; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $14, $pop67 +; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $46 +; NO-SIMD128-FAST-NEXT: i32.or $push56=, $pop53, $pop55 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56 +; NO-SIMD128-FAST-NEXT: i32.and $push57=, $15, $31 +; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $15, $pop66 +; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $47 +; NO-SIMD128-FAST-NEXT: i32.or $push60=, $pop57, $pop59 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60 +; NO-SIMD128-FAST-NEXT: i32.and $push61=, $16, $32 +; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $16, $pop65 +; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $48 +; NO-SIMD128-FAST-NEXT: i32.or $push64=, $pop61, $pop63 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <16 x i8> %c, %v1 %inv_mask = xor <16 x i8> %c, @@ -7482,75 +7546,107 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-LABEL: bitselect_v8i16: ; NO-SIMD128: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.xor $push0=, $16, $24 -; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $8 -; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $24 -; NO-SIMD128-NEXT: i32.store16 14($0), $pop2 -; NO-SIMD128-NEXT: i32.xor $push3=, $15, $23 -; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $7 -; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $23 -; 
NO-SIMD128-NEXT: i32.store16 12($0), $pop5 -; NO-SIMD128-NEXT: i32.xor $push6=, $14, $22 -; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $6 -; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $22 -; NO-SIMD128-NEXT: i32.store16 10($0), $pop8 -; NO-SIMD128-NEXT: i32.xor $push9=, $13, $21 -; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $5 -; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $21 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop11 -; NO-SIMD128-NEXT: i32.xor $push12=, $12, $20 -; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $4 -; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $20 -; NO-SIMD128-NEXT: i32.store16 6($0), $pop14 -; NO-SIMD128-NEXT: i32.xor $push15=, $11, $19 -; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $3 -; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $19 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop17 -; NO-SIMD128-NEXT: i32.xor $push18=, $10, $18 -; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $2 -; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $18 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop20 -; NO-SIMD128-NEXT: i32.xor $push21=, $9, $17 -; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $1 -; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $17 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop23 +; NO-SIMD128-NEXT: i32.and $push0=, $16, $8 +; NO-SIMD128-NEXT: i32.const $push1=, -1 +; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop1 +; NO-SIMD128-NEXT: i32.and $push3=, $24, $pop2 +; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop4 +; NO-SIMD128-NEXT: i32.and $push5=, $15, $7 +; NO-SIMD128-NEXT: i32.const $push39=, -1 +; NO-SIMD128-NEXT: i32.xor $push6=, $7, $pop39 +; NO-SIMD128-NEXT: i32.and $push7=, $23, $pop6 +; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7 +; NO-SIMD128-NEXT: i32.store16 12($0), $pop8 +; NO-SIMD128-NEXT: i32.and $push9=, $14, $6 +; NO-SIMD128-NEXT: i32.const $push38=, -1 +; NO-SIMD128-NEXT: i32.xor $push10=, $6, $pop38 +; NO-SIMD128-NEXT: i32.and $push11=, $22, $pop10 +; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11 +; 
NO-SIMD128-NEXT: i32.store16 10($0), $pop12 +; NO-SIMD128-NEXT: i32.and $push13=, $13, $5 +; NO-SIMD128-NEXT: i32.const $push37=, -1 +; NO-SIMD128-NEXT: i32.xor $push14=, $5, $pop37 +; NO-SIMD128-NEXT: i32.and $push15=, $21, $pop14 +; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop16 +; NO-SIMD128-NEXT: i32.and $push17=, $12, $4 +; NO-SIMD128-NEXT: i32.const $push36=, -1 +; NO-SIMD128-NEXT: i32.xor $push18=, $4, $pop36 +; NO-SIMD128-NEXT: i32.and $push19=, $20, $pop18 +; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop20 +; NO-SIMD128-NEXT: i32.and $push21=, $11, $3 +; NO-SIMD128-NEXT: i32.const $push35=, -1 +; NO-SIMD128-NEXT: i32.xor $push22=, $3, $pop35 +; NO-SIMD128-NEXT: i32.and $push23=, $19, $pop22 +; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop24 +; NO-SIMD128-NEXT: i32.and $push25=, $10, $2 +; NO-SIMD128-NEXT: i32.const $push34=, -1 +; NO-SIMD128-NEXT: i32.xor $push26=, $2, $pop34 +; NO-SIMD128-NEXT: i32.and $push27=, $18, $pop26 +; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop28 +; NO-SIMD128-NEXT: i32.and $push29=, $9, $1 +; NO-SIMD128-NEXT: i32.const $push33=, -1 +; NO-SIMD128-NEXT: i32.xor $push30=, $1, $pop33 +; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop30 +; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop32 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v8i16: ; NO-SIMD128-FAST: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: i32.xor $push0=, $9, $17 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $pop0, $1 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $pop1, $17 -; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, 
$10, $18 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop3, $2 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $pop4, $18 -; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop5 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $11, $19 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3 -; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $19 -; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $20 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $20 -; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11 -; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $21 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $21 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14 -; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $22 -; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6 -; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $22 -; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17 -; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $15, $23 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7 -; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $23 -; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20 -; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $16, $24 -; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8 -; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $24 -; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23 +; NO-SIMD128-FAST-NEXT: i32.and $push0=, $9, $1 +; NO-SIMD128-FAST-NEXT: i32.const $push1=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop1 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $17, $pop2 +; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3 +; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $2 +; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop39 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $18, $pop6 +; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7 +; 
NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $3 +; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop38 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $19, $pop10 +; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11 +; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $4 +; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop37 +; NO-SIMD128-FAST-NEXT: i32.and $push15=, $20, $pop14 +; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $5 +; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop36 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop18 +; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $6 +; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop35 +; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop22 +; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $7 +; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop34 +; NO-SIMD128-FAST-NEXT: i32.and $push27=, $23, $pop26 +; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28 +; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $8 +; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop33 +; NO-SIMD128-FAST-NEXT: i32.and $push31=, $24, $pop30 +; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <8 x 
i16> %v1, %c %inv_mask = xor <8 x i16> @@ -9357,43 +9453,59 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) { ; NO-SIMD128-LABEL: bitselect_v4i32: ; NO-SIMD128: .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.xor $push0=, $8, $12 -; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $4 -; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $12 -; NO-SIMD128-NEXT: i32.store 12($0), $pop2 -; NO-SIMD128-NEXT: i32.xor $push3=, $7, $11 -; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $3 -; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $11 -; NO-SIMD128-NEXT: i32.store 8($0), $pop5 -; NO-SIMD128-NEXT: i32.xor $push6=, $6, $10 -; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $2 -; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $10 -; NO-SIMD128-NEXT: i32.store 4($0), $pop8 -; NO-SIMD128-NEXT: i32.xor $push9=, $5, $9 -; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $1 -; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $9 -; NO-SIMD128-NEXT: i32.store 0($0), $pop11 +; NO-SIMD128-NEXT: i32.const $push1=, -1 +; NO-SIMD128-NEXT: i32.xor $push2=, $4, $pop1 +; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $12 +; NO-SIMD128-NEXT: i32.and $push0=, $4, $8 +; NO-SIMD128-NEXT: i32.or $push4=, $pop3, $pop0 +; NO-SIMD128-NEXT: i32.store 12($0), $pop4 +; NO-SIMD128-NEXT: i32.const $push19=, -1 +; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop19 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $11 +; NO-SIMD128-NEXT: i32.and $push5=, $3, $7 +; NO-SIMD128-NEXT: i32.or $push8=, $pop7, $pop5 +; NO-SIMD128-NEXT: i32.store 8($0), $pop8 +; NO-SIMD128-NEXT: i32.const $push18=, -1 +; NO-SIMD128-NEXT: i32.xor $push10=, $2, $pop18 +; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $10 +; NO-SIMD128-NEXT: i32.and $push9=, $2, $6 +; NO-SIMD128-NEXT: i32.or $push12=, $pop11, $pop9 +; NO-SIMD128-NEXT: i32.store 4($0), $pop12 +; NO-SIMD128-NEXT: i32.const $push17=, -1 +; NO-SIMD128-NEXT: i32.xor $push14=, $1, $pop17 +; NO-SIMD128-NEXT: 
i32.and $push15=, $pop14, $9 +; NO-SIMD128-NEXT: i32.and $push13=, $1, $5 +; NO-SIMD128-NEXT: i32.or $push16=, $pop15, $pop13 +; NO-SIMD128-NEXT: i32.store 0($0), $pop16 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v4i32: ; NO-SIMD128-FAST: .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: i32.xor $push0=, $5, $9 -; NO-SIMD128-FAST-NEXT: i32.and $push1=, $pop0, $1 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $pop1, $9 -; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $10 -; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop3, $2 -; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $pop4, $10 -; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop5 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $11 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3 -; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $11 -; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $8, $12 -; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4 -; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $12 -; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.const $push1=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop1 +; NO-SIMD128-FAST-NEXT: i32.and $push3=, $pop2, $9 +; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $5 +; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop3, $pop0 +; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4 +; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop19 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $10 +; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $6 +; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop7, $pop5 +; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop18 +; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $11 +; NO-SIMD128-FAST-NEXT: 
i32.and $push9=, $3, $7 +; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop11, $pop9 +; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12 +; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop17 +; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $12 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $8 +; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop15, $pop13 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <4 x i32> %c, %v1 %inv_mask = xor <4 x i32> , %c @@ -10862,27 +10974,35 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) { ; NO-SIMD128-LABEL: bitselect_v2i64: ; NO-SIMD128: .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i64.xor $push0=, $4, $6 -; NO-SIMD128-NEXT: i64.and $push1=, $pop0, $2 -; NO-SIMD128-NEXT: i64.xor $push2=, $pop1, $6 -; NO-SIMD128-NEXT: i64.store 8($0), $pop2 -; NO-SIMD128-NEXT: i64.xor $push3=, $3, $5 -; NO-SIMD128-NEXT: i64.and $push4=, $pop3, $1 -; NO-SIMD128-NEXT: i64.xor $push5=, $pop4, $5 -; NO-SIMD128-NEXT: i64.store 0($0), $pop5 +; NO-SIMD128-NEXT: i64.const $push1=, -1 +; NO-SIMD128-NEXT: i64.xor $push2=, $2, $pop1 +; NO-SIMD128-NEXT: i64.and $push3=, $6, $pop2 +; NO-SIMD128-NEXT: i64.and $push0=, $4, $2 +; NO-SIMD128-NEXT: i64.or $push4=, $pop3, $pop0 +; NO-SIMD128-NEXT: i64.store 8($0), $pop4 +; NO-SIMD128-NEXT: i64.const $push9=, -1 +; NO-SIMD128-NEXT: i64.xor $push6=, $1, $pop9 +; NO-SIMD128-NEXT: i64.and $push7=, $5, $pop6 +; NO-SIMD128-NEXT: i64.and $push5=, $3, $1 +; NO-SIMD128-NEXT: i64.or $push8=, $pop7, $pop5 +; NO-SIMD128-NEXT: i64.store 0($0), $pop8 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v2i64: ; NO-SIMD128-FAST: .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: i64.xor $push0=, $3, $5 -; NO-SIMD128-FAST-NEXT: i64.and $push1=, $pop0, $1 -; 
NO-SIMD128-FAST-NEXT: i64.xor $push2=, $pop1, $5 -; NO-SIMD128-FAST-NEXT: i64.store 0($0), $pop2 -; NO-SIMD128-FAST-NEXT: i64.xor $push3=, $4, $6 -; NO-SIMD128-FAST-NEXT: i64.and $push4=, $pop3, $2 -; NO-SIMD128-FAST-NEXT: i64.xor $push5=, $pop4, $6 -; NO-SIMD128-FAST-NEXT: i64.store 8($0), $pop5 +; NO-SIMD128-FAST-NEXT: i64.const $push1=, -1 +; NO-SIMD128-FAST-NEXT: i64.xor $push2=, $1, $pop1 +; NO-SIMD128-FAST-NEXT: i64.and $push3=, $5, $pop2 +; NO-SIMD128-FAST-NEXT: i64.and $push0=, $3, $1 +; NO-SIMD128-FAST-NEXT: i64.or $push4=, $pop3, $pop0 +; NO-SIMD128-FAST-NEXT: i64.store 0($0), $pop4 +; NO-SIMD128-FAST-NEXT: i64.const $push9=, -1 +; NO-SIMD128-FAST-NEXT: i64.xor $push6=, $2, $pop9 +; NO-SIMD128-FAST-NEXT: i64.and $push7=, $6, $pop6 +; NO-SIMD128-FAST-NEXT: i64.and $push5=, $4, $2 +; NO-SIMD128-FAST-NEXT: i64.or $push8=, $pop7, $pop5 +; NO-SIMD128-FAST-NEXT: i64.store 8($0), $pop8 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <2 x i64> %v1, %c %inv_mask = xor <2 x i64> , %c diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll index 4fc0827ac4dd6..2922113b14ea9 100644 --- a/llvm/test/CodeGen/X86/bitselect.ll +++ b/llvm/test/CodeGen/X86/bitselect.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI ; PR46472 ; bitselect(a,b,m) == or(and(a,not(m)),and(b,m)) @@ -17,22 +17,14 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind { ; X86-NEXT: xorb %cl, %al ; X86-NEXT: retl ; -; X64-NOBMI-LABEL: 
bitselect_i8: -; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movl %esi, %eax -; X64-NOBMI-NEXT: xorl %edi, %eax -; X64-NOBMI-NEXT: andl %edx, %eax -; X64-NOBMI-NEXT: xorl %edi, %eax -; X64-NOBMI-NEXT: # kill: def $al killed $al killed $eax -; X64-NOBMI-NEXT: retq -; -; X64-BMI-LABEL: bitselect_i8: -; X64-BMI: # %bb.0: -; X64-BMI-NEXT: andnl %edi, %edx, %eax -; X64-BMI-NEXT: andl %edx, %esi -; X64-BMI-NEXT: orl %esi, %eax -; X64-BMI-NEXT: # kill: def $al killed $al killed $eax -; X64-BMI-NEXT: retq +; X64-LABEL: bitselect_i8: +; X64: # %bb.0: +; X64-NEXT: andl %edx, %esi +; X64-NEXT: movl %edx, %eax +; X64-NEXT: notb %al +; X64-NEXT: andb %dil, %al +; X64-NEXT: orb %sil, %al +; X64-NEXT: retq %not = xor i8 %m, -1 %ma = and i8 %a, %not %mb = and i8 %b, %m @@ -43,20 +35,21 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind { define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind { ; X86-LABEL: bitselect_i16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: xorw %cx, %ax -; X86-NEXT: andw {{[0-9]+}}(%esp), %ax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: xorw %ax, %cx +; X86-NEXT: andw {{[0-9]+}}(%esp), %cx ; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-NOBMI-LABEL: bitselect_i16: ; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movl %esi, %eax -; X64-NOBMI-NEXT: xorl %edi, %eax -; X64-NOBMI-NEXT: andl %edx, %eax -; X64-NOBMI-NEXT: xorl %edi, %eax +; X64-NOBMI-NEXT: movl %edx, %eax +; X64-NOBMI-NEXT: andl %edx, %esi +; X64-NOBMI-NEXT: notl %eax +; X64-NOBMI-NEXT: andl %edi, %eax +; X64-NOBMI-NEXT: orl %esi, %eax ; X64-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NOBMI-NEXT: retq ; @@ -193,12 +186,13 @@ define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind { ; ; X64-BMI-LABEL: bitselect_i128: ; X64-BMI: # %bb.0: +; X64-BMI-NEXT: andnq %rsi, %r9, %rsi ; X64-BMI-NEXT: andnq %rdi, %r8, %rax +; X64-BMI-NEXT: andq %r9, %rcx +; 
X64-BMI-NEXT: orq %rcx, %rsi ; X64-BMI-NEXT: andq %r8, %rdx ; X64-BMI-NEXT: orq %rdx, %rax -; X64-BMI-NEXT: andnq %rsi, %r9, %rdx -; X64-BMI-NEXT: andq %r9, %rcx -; X64-BMI-NEXT: orq %rcx, %rdx +; X64-BMI-NEXT: movq %rsi, %rdx ; X64-BMI-NEXT: retq %not = xor i128 %m, -1 %ma = and i128 %a, %not diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll index 4a4eecbdfb3f3..b2614c5fe0493 100644 --- a/llvm/test/CodeGen/X86/fold-masked-merge.ll +++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll @@ -30,17 +30,18 @@ define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) { define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { ; NOBMI-LABEL: masked_merge1: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movl %esi, %eax -; NOBMI-NEXT: xorl %edx, %eax -; NOBMI-NEXT: andl %edi, %eax -; NOBMI-NEXT: xorl %edx, %eax +; NOBMI-NEXT: movl %edi, %eax +; NOBMI-NEXT: andl %edi, %esi +; NOBMI-NEXT: notl %eax +; NOBMI-NEXT: andl %edx, %eax +; NOBMI-NEXT: orl %esi, %eax ; NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; NOBMI-NEXT: retq ; ; BMI-LABEL: masked_merge1: ; BMI: # %bb.0: -; BMI-NEXT: andnl %edx, %edi, %eax ; BMI-NEXT: andl %edi, %esi +; BMI-NEXT: andnl %edx, %edi, %eax ; BMI-NEXT: orl %esi, %eax ; BMI-NEXT: # kill: def $ax killed $ax killed $eax ; BMI-NEXT: retq @@ -52,11 +53,20 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { } define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) { -; CHECK-LABEL: masked_merge2: -; CHECK: # %bb.0: -; CHECK-NEXT: movl %esi, %eax -; CHECK-NEXT: # kill: def $al killed $al killed $eax -; CHECK-NEXT: retq +; NOBMI-LABEL: masked_merge2: +; NOBMI: # %bb.0: +; NOBMI-NEXT: movl %esi, %eax +; NOBMI-NEXT: # kill: def $al killed $al killed $eax +; NOBMI-NEXT: retq +; +; BMI-LABEL: masked_merge2: +; BMI: # %bb.0: +; BMI-NEXT: movl %edi, %eax +; BMI-NEXT: notb %al +; BMI-NEXT: andb %sil, %al +; BMI-NEXT: andb %dil, %sil +; BMI-NEXT: orb %sil, %al +; BMI-NEXT: retq %not = xor i8 %a0, -1 %and0 = and i8 %not, %a1 %and1 = and 
i8 %a1, %a0 diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll index 6a55d740fe421..9c9d06921096c 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll @@ -6,18 +6,21 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) { ; CHECK-NOBMI-LABEL: out8: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edi, %eax -; CHECK-NOBMI-NEXT: xorl %esi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: xorl %esi, %eax +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %edi +; CHECK-NOBMI-NEXT: notb %al +; CHECK-NOBMI-NEXT: andb %sil, %al +; CHECK-NOBMI-NEXT: orb %dil, %al ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out8: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: andnl %esi, %edx, %eax +; CHECK-BMI-NEXT: movl %edx, %eax ; CHECK-BMI-NEXT: andl %edx, %edi -; CHECK-BMI-NEXT: orl %edi, %eax +; CHECK-BMI-NEXT: notb %al +; CHECK-BMI-NEXT: andb %sil, %al +; CHECK-BMI-NEXT: orb %dil, %al ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %mx = and i8 %x, %mask @@ -30,17 +33,18 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) { define i16 @out16(i16 %x, i16 %y, i16 %mask) { ; CHECK-NOBMI-LABEL: out16: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edi, %eax -; CHECK-NOBMI-NEXT: xorl %esi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: xorl %esi, %eax +; CHECK-NOBMI-NEXT: movl %edx, %eax +; CHECK-NOBMI-NEXT: andl %edx, %edi +; CHECK-NOBMI-NEXT: notl %eax +; CHECK-NOBMI-NEXT: andl %esi, %eax +; CHECK-NOBMI-NEXT: orl %edi, %eax ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out16: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: andnl %esi, %edx, %eax ; CHECK-BMI-NEXT: andl %edx, %edi +; CHECK-BMI-NEXT: andnl %esi, %edx, %eax ; 
CHECK-BMI-NEXT: orl %edi, %eax ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-BMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll index 809c15881cc9b..b1194bedc4e1c 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -16,10 +16,11 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %esi, %eax -; CHECK-NEXT: andl %edx, %eax -; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: andl %edx, %edi +; CHECK-NEXT: notb %al +; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: orb %dil, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %mx = and <1 x i8> %x, %mask @@ -36,28 +37,32 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v2i8: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: movl %edi, %eax -; CHECK-BASELINE-NEXT: xorl %edx, %eax -; CHECK-BASELINE-NEXT: andl %r8d, %eax -; CHECK-BASELINE-NEXT: xorl %edx, %eax -; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: movl %r8d, %eax ; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: andl %r8d, %edi +; CHECK-BASELINE-NEXT: notb %al +; CHECK-BASELINE-NEXT: notb %r9b +; CHECK-BASELINE-NEXT: andb %cl, %r9b +; CHECK-BASELINE-NEXT: andb %dl, %al +; CHECK-BASELINE-NEXT: orb %dil, %al +; CHECK-BASELINE-NEXT: orb %sil, %r9b ; CHECK-BASELINE-NEXT: # kill: def $al killed $al killed $eax -; CHECK-BASELINE-NEXT: movl %esi, %edx +; CHECK-BASELINE-NEXT: movl %r9d, %edx ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v2i8: ; CHECK-SSE1: # %bb.0: -; 
CHECK-SSE1-NEXT: movl %edi, %eax -; CHECK-SSE1-NEXT: xorl %edx, %eax -; CHECK-SSE1-NEXT: andl %r8d, %eax -; CHECK-SSE1-NEXT: xorl %edx, %eax -; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: movl %r8d, %eax ; CHECK-SSE1-NEXT: andl %r9d, %esi -; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: andl %r8d, %edi +; CHECK-SSE1-NEXT: notb %al +; CHECK-SSE1-NEXT: notb %r9b +; CHECK-SSE1-NEXT: andb %cl, %r9b +; CHECK-SSE1-NEXT: andb %dl, %al +; CHECK-SSE1-NEXT: orb %dil, %al +; CHECK-SSE1-NEXT: orb %sil, %r9b ; CHECK-SSE1-NEXT: # kill: def $al killed $al killed $eax -; CHECK-SSE1-NEXT: movl %esi, %edx +; CHECK-SSE1-NEXT: movl %r9d, %edx ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v2i8: @@ -81,10 +86,11 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %esi, %eax -; CHECK-NEXT: andl %edx, %eax -; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: movl %edx, %eax +; CHECK-NEXT: andl %edx, %edi +; CHECK-NEXT: notl %eax +; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: orl %edi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq %mx = and <1 x i16> %x, %mask @@ -229,28 +235,32 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v2i16: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: movl %edi, %eax -; CHECK-BASELINE-NEXT: xorl %edx, %eax -; CHECK-BASELINE-NEXT: andl %r8d, %eax -; CHECK-BASELINE-NEXT: xorl %edx, %eax -; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: movl %r8d, %eax ; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: andl %r8d, %edi +; CHECK-BASELINE-NEXT: notl %eax +; CHECK-BASELINE-NEXT: notl %r9d +; 
CHECK-BASELINE-NEXT: andl %ecx, %r9d +; CHECK-BASELINE-NEXT: orl %esi, %r9d +; CHECK-BASELINE-NEXT: andl %edx, %eax +; CHECK-BASELINE-NEXT: orl %edi, %eax ; CHECK-BASELINE-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-BASELINE-NEXT: movl %esi, %edx +; CHECK-BASELINE-NEXT: movl %r9d, %edx ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v2i16: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movl %edi, %eax -; CHECK-SSE1-NEXT: xorl %edx, %eax -; CHECK-SSE1-NEXT: andl %r8d, %eax -; CHECK-SSE1-NEXT: xorl %edx, %eax -; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: movl %r8d, %eax ; CHECK-SSE1-NEXT: andl %r9d, %esi -; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: andl %r8d, %edi +; CHECK-SSE1-NEXT: notl %eax +; CHECK-SSE1-NEXT: notl %r9d +; CHECK-SSE1-NEXT: andl %ecx, %r9d +; CHECK-SSE1-NEXT: orl %esi, %r9d +; CHECK-SSE1-NEXT: andl %edx, %eax +; CHECK-SSE1-NEXT: orl %edi, %eax ; CHECK-SSE1-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE1-NEXT: movl %esi, %edx +; CHECK-SSE1-NEXT: movl %r9d, %edx ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v2i16: @@ -429,12 +439,9 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin ; CHECK-BASELINE-LABEL: out_v4i16: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d ; CHECK-BASELINE-NEXT: xorl %r11d, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-BASELINE-NEXT: xorl %r11d, %edx @@ -444,21 +451,21 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin ; 
CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-BASELINE-NEXT: xorl %edi, %r8d +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) ; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) -; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d ; CHECK-SSE1-NEXT: xorl %r11d, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-SSE1-NEXT: xorl %r11d, %edx @@ -468,10 +475,13 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin ; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-SSE1-NEXT: xorl %edi, %r8d +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) ; CHECK-SSE1-NEXT: movw %dx, 2(%rax) -; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i16: @@ -496,43 +506,43 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n ; CHECK-BASELINE-LABEL: out_v4i16_undef: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movl 
{{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: xorl %r10d, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-BASELINE-NEXT: xorl %r10d, %edx ; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-BASELINE-NEXT: xorl %edi, %r8d +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) +; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) ; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) -; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16_undef: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx -; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: xorl %r10d, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-SSE1-NEXT: xorl %r10d, %edx ; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-SSE1-NEXT: xorl %edi, %r8d +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) +; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) ; CHECK-SSE1-NEXT: movw %dx, 2(%rax) -; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: 
out_v4i16_undef: @@ -873,14 +883,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r12d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: xorl %r12d, %esi ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-BASELINE-NEXT: xorl %r12d, %esi @@ -896,16 +906,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-BASELINE-NEXT: xorl %ebx, %r9d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w ; CHECK-BASELINE-NEXT: xorl %ebx, %r9d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: xorw %r11w, %bx +; CHECK-BASELINE-NEXT: movl %r11d, %ebx +; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx ; CHECK-BASELINE-NEXT: xorl %r11d, %ebx -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorw %r10w, %r11w +; CHECK-BASELINE-NEXT: movl %r10d, %r11d +; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r11w ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w ; CHECK-BASELINE-NEXT: xorl 
%r10d, %r11d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: xorw %di, %r10w +; CHECK-BASELINE-NEXT: movl %edi, %r10d +; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r10w ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w ; CHECK-BASELINE-NEXT: xorl %edi, %r10d ; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax) @@ -931,14 +941,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r12d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: xorl %r12d, %esi ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-SSE1-NEXT: xorl %r12d, %esi @@ -954,16 +964,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-SSE1-NEXT: xorl %ebx, %r9d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w ; CHECK-SSE1-NEXT: xorl %ebx, %r9d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: xorw %r11w, %bx +; CHECK-SSE1-NEXT: movl %r11d, %ebx +; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx ; CHECK-SSE1-NEXT: xorl %r11d, %ebx -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorw 
%r10w, %r11w +; CHECK-SSE1-NEXT: movl %r10d, %r11d +; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r11w ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w ; CHECK-SSE1-NEXT: xorl %r10d, %r11d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: xorw %di, %r10w +; CHECK-SSE1-NEXT: movl %edi, %r10d +; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r10w ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w ; CHECK-SSE1-NEXT: xorl %edi, %r10d ; CHECK-SSE1-NEXT: movw %r10w, 14(%rax) @@ -1749,117 +1759,113 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movq %rcx, %r10 -; CHECK-BASELINE-NEXT: movq %rdx, %r8 -; CHECK-BASELINE-NEXT: movq %rsi, %r9 -; CHECK-BASELINE-NEXT: movq %rdi, %r11 -; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %ebp -; CHECK-BASELINE-NEXT: movl 16(%rdx), %r15d -; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %r13d -; CHECK-BASELINE-NEXT: movl 12(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r14d -; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebx -; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %eax -; CHECK-BASELINE-NEXT: movl (%rdx), %ecx -; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx -; CHECK-BASELINE-NEXT: movzwl 2(%r8), %esi -; CHECK-BASELINE-NEXT: movzwl (%r9), %edi -; CHECK-BASELINE-NEXT: xorw %cx, %di -; CHECK-BASELINE-NEXT: andw (%r10), %di -; CHECK-BASELINE-NEXT: xorl %ecx, %edi -; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 2(%r9), %ecx -; CHECK-BASELINE-NEXT: xorw %si, %cx -; CHECK-BASELINE-NEXT: andw 2(%r10), %cx -; CHECK-BASELINE-NEXT: xorl %esi, %ecx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 4(%r9), %ecx -; CHECK-BASELINE-NEXT: xorw %dx, %cx -; CHECK-BASELINE-NEXT: andw 4(%r10), %cx -; CHECK-BASELINE-NEXT: xorl %edx, %ecx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill 
-; CHECK-BASELINE-NEXT: movzwl 6(%r9), %ecx -; CHECK-BASELINE-NEXT: xorw %ax, %cx -; CHECK-BASELINE-NEXT: andw 6(%r10), %cx -; CHECK-BASELINE-NEXT: xorl %eax, %ecx -; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 8(%r9), %eax +; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d +; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp +; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %ebx +; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r11d +; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzwl (%rdx), %r8d +; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r12d +; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r8w, %ax +; CHECK-BASELINE-NEXT: andw (%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r8d +; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r12w, %ax +; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r12d +; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r9w, %ax +; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r9d +; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r10w, %ax +; CHECK-BASELINE-NEXT: andw 6(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r10d +; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r11w, %ax +; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r11d +; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r13w, %ax +; CHECK-BASELINE-NEXT: andw 
10(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r13d +; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax ; CHECK-BASELINE-NEXT: xorw %bx, %ax -; CHECK-BASELINE-NEXT: andw 8(%r10), %ax -; CHECK-BASELINE-NEXT: xorl %ebx, %eax -; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 10(%r9), %ebx -; CHECK-BASELINE-NEXT: xorw %r14w, %bx -; CHECK-BASELINE-NEXT: andw 10(%r10), %bx -; CHECK-BASELINE-NEXT: xorl %r14d, %ebx -; CHECK-BASELINE-NEXT: movzwl 12(%r9), %r14d -; CHECK-BASELINE-NEXT: xorw %r12w, %r14w -; CHECK-BASELINE-NEXT: andw 12(%r10), %r14w -; CHECK-BASELINE-NEXT: xorl %r12d, %r14d -; CHECK-BASELINE-NEXT: movzwl 14(%r9), %r12d -; CHECK-BASELINE-NEXT: xorw %r13w, %r12w -; CHECK-BASELINE-NEXT: andw 14(%r10), %r12w -; CHECK-BASELINE-NEXT: xorl %r13d, %r12d -; CHECK-BASELINE-NEXT: movzwl 16(%r9), %r13d -; CHECK-BASELINE-NEXT: xorw %r15w, %r13w -; CHECK-BASELINE-NEXT: andw 16(%r10), %r13w -; CHECK-BASELINE-NEXT: xorl %r15d, %r13d -; CHECK-BASELINE-NEXT: movzwl 18(%r9), %r15d -; CHECK-BASELINE-NEXT: xorw %bp, %r15w -; CHECK-BASELINE-NEXT: andw 18(%r10), %r15w -; CHECK-BASELINE-NEXT: xorl %ebp, %r15d -; CHECK-BASELINE-NEXT: movl 20(%r8), %eax -; CHECK-BASELINE-NEXT: movzwl 20(%r9), %ebp -; CHECK-BASELINE-NEXT: xorw %ax, %bp -; CHECK-BASELINE-NEXT: andw 20(%r10), %bp +; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %ebx +; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %bp, %ax +; CHECK-BASELINE-NEXT: andw 14(%rcx), %ax ; CHECK-BASELINE-NEXT: xorl %eax, %ebp -; CHECK-BASELINE-NEXT: movzwl 22(%r8), %eax -; CHECK-BASELINE-NEXT: movzwl 22(%r9), %esi -; CHECK-BASELINE-NEXT: xorw %ax, %si -; CHECK-BASELINE-NEXT: andw 22(%r10), %si -; CHECK-BASELINE-NEXT: xorl %eax, %esi -; CHECK-BASELINE-NEXT: movl 24(%r8), %eax -; CHECK-BASELINE-NEXT: movzwl 24(%r9), %edx -; CHECK-BASELINE-NEXT: xorw %ax, %dx -; 
CHECK-BASELINE-NEXT: andw 24(%r10), %dx -; CHECK-BASELINE-NEXT: xorl %eax, %edx -; CHECK-BASELINE-NEXT: movzwl 26(%r8), %eax -; CHECK-BASELINE-NEXT: movzwl 26(%r9), %ecx -; CHECK-BASELINE-NEXT: xorw %ax, %cx -; CHECK-BASELINE-NEXT: andw 26(%r10), %cx -; CHECK-BASELINE-NEXT: xorl %eax, %ecx -; CHECK-BASELINE-NEXT: movl 28(%r8), %edi -; CHECK-BASELINE-NEXT: movzwl 28(%r9), %eax -; CHECK-BASELINE-NEXT: xorw %di, %ax -; CHECK-BASELINE-NEXT: andw 28(%r10), %ax -; CHECK-BASELINE-NEXT: xorl %edi, %eax -; CHECK-BASELINE-NEXT: movzwl 30(%r8), %edi -; CHECK-BASELINE-NEXT: movzwl 30(%r9), %r8d -; CHECK-BASELINE-NEXT: xorw %di, %r8w -; CHECK-BASELINE-NEXT: andw 30(%r10), %r8w -; CHECK-BASELINE-NEXT: xorl %edi, %r8d -; CHECK-BASELINE-NEXT: movw %r8w, 30(%r11) -; CHECK-BASELINE-NEXT: movw %ax, 28(%r11) -; CHECK-BASELINE-NEXT: movw %cx, 26(%r11) -; CHECK-BASELINE-NEXT: movw %dx, 24(%r11) -; CHECK-BASELINE-NEXT: movw %si, 22(%r11) -; CHECK-BASELINE-NEXT: movw %bp, 20(%r11) -; CHECK-BASELINE-NEXT: movw %r15w, 18(%r11) -; CHECK-BASELINE-NEXT: movw %r13w, 16(%r11) -; CHECK-BASELINE-NEXT: movw %r12w, 14(%r11) -; CHECK-BASELINE-NEXT: movw %r14w, 12(%r11) -; CHECK-BASELINE-NEXT: movw %bx, 10(%r11) +; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r14w, %ax +; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r14d +; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r15w, %ax +; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r15d +; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %r13d +; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r13w, %ax +; CHECK-BASELINE-NEXT: andw 20(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r13d +; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %r9d +; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r9w, %ax +; CHECK-BASELINE-NEXT: andw 22(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r9d +; CHECK-BASELINE-NEXT: movzwl 
24(%rdx), %r8d +; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %eax +; CHECK-BASELINE-NEXT: xorw %r8w, %ax +; CHECK-BASELINE-NEXT: andw 24(%rcx), %ax +; CHECK-BASELINE-NEXT: xorl %eax, %r8d +; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax +; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r10d +; CHECK-BASELINE-NEXT: xorw %ax, %r10w +; CHECK-BASELINE-NEXT: andw 26(%rcx), %r10w +; CHECK-BASELINE-NEXT: xorl %r10d, %eax +; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r10d +; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r11d +; CHECK-BASELINE-NEXT: xorw %r10w, %r11w +; CHECK-BASELINE-NEXT: andw 28(%rcx), %r11w +; CHECK-BASELINE-NEXT: xorl %r11d, %r10d +; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edx +; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi +; CHECK-BASELINE-NEXT: xorw %dx, %si +; CHECK-BASELINE-NEXT: andw 30(%rcx), %si +; CHECK-BASELINE-NEXT: xorl %esi, %edx +; CHECK-BASELINE-NEXT: movw %dx, 30(%rdi) +; CHECK-BASELINE-NEXT: movw %r10w, 28(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 26(%rdi) +; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi) +; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi) +; CHECK-BASELINE-NEXT: movw %r13w, 20(%rdi) +; CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi) +; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi) +; CHECK-BASELINE-NEXT: movw %bp, 14(%rdi) +; CHECK-BASELINE-NEXT: movw %bx, 12(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 8(%r11) +; CHECK-BASELINE-NEXT: movw %ax, 10(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 6(%r11) +; CHECK-BASELINE-NEXT: movw %ax, 8(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 4(%r11) +; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 2(%r11) +; CHECK-BASELINE-NEXT: movw %ax, 4(%rdi) +; CHECK-BASELINE-NEXT: movw %r12w, 2(%rdi) ; CHECK-BASELINE-NEXT: movl 
{{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, (%r11) -; CHECK-BASELINE-NEXT: movq %r11, %rax +; CHECK-BASELINE-NEXT: movw %ax, (%rdi) +; CHECK-BASELINE-NEXT: movq %rdi, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -1876,117 +1882,113 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movq %rcx, %r10 -; CHECK-SSE1-NEXT: movq %rdx, %r8 -; CHECK-SSE1-NEXT: movq %rsi, %r9 -; CHECK-SSE1-NEXT: movq %rdi, %r11 -; CHECK-SSE1-NEXT: movzwl 18(%rdx), %ebp -; CHECK-SSE1-NEXT: movl 16(%rdx), %r15d -; CHECK-SSE1-NEXT: movzwl 14(%rdx), %r13d -; CHECK-SSE1-NEXT: movl 12(%rdx), %r12d -; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r14d -; CHECK-SSE1-NEXT: movl 8(%rdx), %ebx -; CHECK-SSE1-NEXT: movzwl 6(%rdx), %eax -; CHECK-SSE1-NEXT: movl (%rdx), %ecx -; CHECK-SSE1-NEXT: movl 4(%rdx), %edx -; CHECK-SSE1-NEXT: movzwl 2(%r8), %esi -; CHECK-SSE1-NEXT: movzwl (%r9), %edi -; CHECK-SSE1-NEXT: xorw %cx, %di -; CHECK-SSE1-NEXT: andw (%r10), %di -; CHECK-SSE1-NEXT: xorl %ecx, %edi -; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 2(%r9), %ecx -; CHECK-SSE1-NEXT: xorw %si, %cx -; CHECK-SSE1-NEXT: andw 2(%r10), %cx -; CHECK-SSE1-NEXT: xorl %esi, %ecx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 4(%r9), %ecx -; CHECK-SSE1-NEXT: xorw %dx, %cx -; CHECK-SSE1-NEXT: andw 4(%r10), %cx -; CHECK-SSE1-NEXT: xorl %edx, %ecx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 6(%r9), %ecx -; CHECK-SSE1-NEXT: xorw %ax, %cx -; CHECK-SSE1-NEXT: andw 6(%r10), %cx -; CHECK-SSE1-NEXT: xorl %eax, %ecx -; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 8(%r9), %eax +; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d 
+; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d +; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp +; CHECK-SSE1-NEXT: movzwl 12(%rdx), %ebx +; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r13d +; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r11d +; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r10d +; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r9d +; CHECK-SSE1-NEXT: movzwl (%rdx), %r8d +; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r12d +; CHECK-SSE1-NEXT: movzwl (%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r8w, %ax +; CHECK-SSE1-NEXT: andw (%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r8d +; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r12w, %ax +; CHECK-SSE1-NEXT: andw 2(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r12d +; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r9w, %ax +; CHECK-SSE1-NEXT: andw 4(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r9d +; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r10w, %ax +; CHECK-SSE1-NEXT: andw 6(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r10d +; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r11w, %ax +; CHECK-SSE1-NEXT: andw 8(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r11d +; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r13w, %ax +; CHECK-SSE1-NEXT: andw 10(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r13d +; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax ; CHECK-SSE1-NEXT: xorw %bx, %ax -; CHECK-SSE1-NEXT: andw 8(%r10), %ax -; CHECK-SSE1-NEXT: xorl %ebx, %eax -; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 10(%r9), %ebx -; CHECK-SSE1-NEXT: xorw %r14w, %bx -; CHECK-SSE1-NEXT: andw 10(%r10), %bx -; CHECK-SSE1-NEXT: 
xorl %r14d, %ebx -; CHECK-SSE1-NEXT: movzwl 12(%r9), %r14d -; CHECK-SSE1-NEXT: xorw %r12w, %r14w -; CHECK-SSE1-NEXT: andw 12(%r10), %r14w -; CHECK-SSE1-NEXT: xorl %r12d, %r14d -; CHECK-SSE1-NEXT: movzwl 14(%r9), %r12d -; CHECK-SSE1-NEXT: xorw %r13w, %r12w -; CHECK-SSE1-NEXT: andw 14(%r10), %r12w -; CHECK-SSE1-NEXT: xorl %r13d, %r12d -; CHECK-SSE1-NEXT: movzwl 16(%r9), %r13d -; CHECK-SSE1-NEXT: xorw %r15w, %r13w -; CHECK-SSE1-NEXT: andw 16(%r10), %r13w -; CHECK-SSE1-NEXT: xorl %r15d, %r13d -; CHECK-SSE1-NEXT: movzwl 18(%r9), %r15d -; CHECK-SSE1-NEXT: xorw %bp, %r15w -; CHECK-SSE1-NEXT: andw 18(%r10), %r15w -; CHECK-SSE1-NEXT: xorl %ebp, %r15d -; CHECK-SSE1-NEXT: movl 20(%r8), %eax -; CHECK-SSE1-NEXT: movzwl 20(%r9), %ebp -; CHECK-SSE1-NEXT: xorw %ax, %bp -; CHECK-SSE1-NEXT: andw 20(%r10), %bp +; CHECK-SSE1-NEXT: andw 12(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %ebx +; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %bp, %ax +; CHECK-SSE1-NEXT: andw 14(%rcx), %ax ; CHECK-SSE1-NEXT: xorl %eax, %ebp -; CHECK-SSE1-NEXT: movzwl 22(%r8), %eax -; CHECK-SSE1-NEXT: movzwl 22(%r9), %esi -; CHECK-SSE1-NEXT: xorw %ax, %si -; CHECK-SSE1-NEXT: andw 22(%r10), %si -; CHECK-SSE1-NEXT: xorl %eax, %esi -; CHECK-SSE1-NEXT: movl 24(%r8), %eax -; CHECK-SSE1-NEXT: movzwl 24(%r9), %edx -; CHECK-SSE1-NEXT: xorw %ax, %dx -; CHECK-SSE1-NEXT: andw 24(%r10), %dx -; CHECK-SSE1-NEXT: xorl %eax, %edx -; CHECK-SSE1-NEXT: movzwl 26(%r8), %eax -; CHECK-SSE1-NEXT: movzwl 26(%r9), %ecx -; CHECK-SSE1-NEXT: xorw %ax, %cx -; CHECK-SSE1-NEXT: andw 26(%r10), %cx -; CHECK-SSE1-NEXT: xorl %eax, %ecx -; CHECK-SSE1-NEXT: movl 28(%r8), %edi -; CHECK-SSE1-NEXT: movzwl 28(%r9), %eax -; CHECK-SSE1-NEXT: xorw %di, %ax -; CHECK-SSE1-NEXT: andw 28(%r10), %ax -; CHECK-SSE1-NEXT: xorl %edi, %eax -; CHECK-SSE1-NEXT: movzwl 30(%r8), %edi -; CHECK-SSE1-NEXT: movzwl 30(%r9), %r8d -; CHECK-SSE1-NEXT: xorw %di, %r8w -; CHECK-SSE1-NEXT: andw 30(%r10), %r8w -; CHECK-SSE1-NEXT: xorl %edi, %r8d -; 
CHECK-SSE1-NEXT: movw %r8w, 30(%r11) -; CHECK-SSE1-NEXT: movw %ax, 28(%r11) -; CHECK-SSE1-NEXT: movw %cx, 26(%r11) -; CHECK-SSE1-NEXT: movw %dx, 24(%r11) -; CHECK-SSE1-NEXT: movw %si, 22(%r11) -; CHECK-SSE1-NEXT: movw %bp, 20(%r11) -; CHECK-SSE1-NEXT: movw %r15w, 18(%r11) -; CHECK-SSE1-NEXT: movw %r13w, 16(%r11) -; CHECK-SSE1-NEXT: movw %r12w, 14(%r11) -; CHECK-SSE1-NEXT: movw %r14w, 12(%r11) -; CHECK-SSE1-NEXT: movw %bx, 10(%r11) +; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r14w, %ax +; CHECK-SSE1-NEXT: andw 16(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r14d +; CHECK-SSE1-NEXT: movzwl 18(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r15w, %ax +; CHECK-SSE1-NEXT: andw 18(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r15d +; CHECK-SSE1-NEXT: movzwl 20(%rdx), %r13d +; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r13w, %ax +; CHECK-SSE1-NEXT: andw 20(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r13d +; CHECK-SSE1-NEXT: movzwl 22(%rdx), %r9d +; CHECK-SSE1-NEXT: movzwl 22(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r9w, %ax +; CHECK-SSE1-NEXT: andw 22(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r9d +; CHECK-SSE1-NEXT: movzwl 24(%rdx), %r8d +; CHECK-SSE1-NEXT: movzwl 24(%rsi), %eax +; CHECK-SSE1-NEXT: xorw %r8w, %ax +; CHECK-SSE1-NEXT: andw 24(%rcx), %ax +; CHECK-SSE1-NEXT: xorl %eax, %r8d +; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax +; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r10d +; CHECK-SSE1-NEXT: xorw %ax, %r10w +; CHECK-SSE1-NEXT: andw 26(%rcx), %r10w +; CHECK-SSE1-NEXT: xorl %r10d, %eax +; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r10d +; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r11d +; CHECK-SSE1-NEXT: xorw %r10w, %r11w +; CHECK-SSE1-NEXT: andw 28(%rcx), %r11w +; CHECK-SSE1-NEXT: xorl %r11d, %r10d +; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edx +; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi +; CHECK-SSE1-NEXT: xorw %dx, %si +; CHECK-SSE1-NEXT: andw 30(%rcx), %si +; CHECK-SSE1-NEXT: xorl %esi, %edx +; CHECK-SSE1-NEXT: movw %dx, 30(%rdi) +; CHECK-SSE1-NEXT: movw %r10w, 
28(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 26(%rdi) +; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi) +; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi) +; CHECK-SSE1-NEXT: movw %r13w, 20(%rdi) +; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi) +; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi) +; CHECK-SSE1-NEXT: movw %bp, 14(%rdi) +; CHECK-SSE1-NEXT: movw %bx, 12(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 8(%r11) +; CHECK-SSE1-NEXT: movw %ax, 10(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 6(%r11) +; CHECK-SSE1-NEXT: movw %ax, 8(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 4(%r11) +; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 2(%r11) +; CHECK-SSE1-NEXT: movw %ax, 4(%rdi) +; CHECK-SSE1-NEXT: movw %r12w, 2(%rdi) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, (%r11) -; CHECK-SSE1-NEXT: movq %r11, %rax +; CHECK-SSE1-NEXT: movw %ax, (%rdi) +; CHECK-SSE1-NEXT: movq %rdi, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 From 0a6463039da89914c7a0f99622fb7a008abde2fd Mon Sep 17 00:00:00 2001 From: Florian Mayer Date: Tue, 10 Jun 2025 19:48:09 -0700 Subject: [PATCH 024/851] [NFC] get rid of `undef` in avx512vl-intrinsics.ll test (#143641) --- llvm/test/CodeGen/X86/avx512vl-intrinsics.ll | 72 ++++++++++---------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll index 0973824fbb0ef..b408aac218108 100644 --- a/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -46,7 +46,7 @@ define <2 x double> @test_compress_pd_128(<2 x double> %data) { ; CHECK-LABEL: test_compress_pd_128: ; CHECK: # %bb.0: ; 
CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> undef, <2 x i1> ) + %1 = call <2 x double> @llvm.x86.avx512.mask.compress.v2f64(<2 x double> %data, <2 x double> poison, <2 x i1> ) ret <2 x double> %1 } @@ -94,7 +94,7 @@ define <4 x float> @test_compress_ps_128(<4 x float> %data) { ; CHECK-LABEL: test_compress_ps_128: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> undef, <4 x i1> ) + %1 = call <4 x float> @llvm.x86.avx512.mask.compress.v4f32(<4 x float> %data, <4 x float> poison, <4 x i1> ) ret <4 x float> %1 } @@ -142,7 +142,7 @@ define <2 x i64> @test_compress_q_128(<2 x i64> %data) { ; CHECK-LABEL: test_compress_q_128: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> undef, <2 x i1> ) + %1 = call <2 x i64> @llvm.x86.avx512.mask.compress.v2i64(<2 x i64> %data, <2 x i64> poison, <2 x i1> ) ret <2 x i64> %1 } @@ -190,7 +190,7 @@ define <4 x i32> @test_compress_d_128(<4 x i32> %data) { ; CHECK-LABEL: test_compress_d_128: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> undef, <4 x i1> ) + %1 = call <4 x i32> @llvm.x86.avx512.mask.compress.v4i32(<4 x i32> %data, <4 x i32> poison, <4 x i1> ) ret <4 x i32> %1 } @@ -198,7 +198,7 @@ define <2 x double> @test_expand_pd_128(<2 x double> %data) { ; CHECK-LABEL: test_expand_pd_128: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> undef, <2 x i1> ) + %1 = call <2 x double> @llvm.x86.avx512.mask.expand.v2f64(<2 x double> %data, <2 x double> poison, <2 x i1> ) ret <2 x double> %1 } @@ -246,7 +246,7 @@ define <4 x float> 
@test_expand_ps_128(<4 x float> %data) { ; CHECK-LABEL: test_expand_ps_128: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> undef, <4 x i1> ) + %1 = call <4 x float> @llvm.x86.avx512.mask.expand.v4f32(<4 x float> %data, <4 x float> poison, <4 x i1> ) ret <4 x float> %1 } @@ -294,7 +294,7 @@ define <2 x i64> @test_expand_q_128(<2 x i64> %data) { ; CHECK-LABEL: test_expand_q_128: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> undef, <2 x i1> ) + %1 = call <2 x i64> @llvm.x86.avx512.mask.expand.v2i64(<2 x i64> %data, <2 x i64> poison, <2 x i1> ) ret <2 x i64> %1 } @@ -342,7 +342,7 @@ define <4 x i32> @test_expand_d_128(<4 x i32> %data) { ; CHECK-LABEL: test_expand_d_128: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> undef, <4 x i1> ) + %1 = call <4 x i32> @llvm.x86.avx512.mask.expand.v4i32(<4 x i32> %data, <4 x i32> poison, <4 x i1> ) ret <4 x i32> %1 } @@ -430,7 +430,7 @@ define <4 x double> @test_compress_pd_256(<4 x double> %data) { ; CHECK-LABEL: test_compress_pd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> undef, <4 x i1> ) + %1 = call <4 x double> @llvm.x86.avx512.mask.compress.v4f64(<4 x double> %data, <4 x double> poison, <4 x i1> ) ret <4 x double> %1 } @@ -476,7 +476,7 @@ define <8 x float> @test_compress_ps_256(<8 x float> %data) { ; CHECK-LABEL: test_compress_ps_256: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> undef, <8 x i1> ) + %1 = call <8 x float> @llvm.x86.avx512.mask.compress.v8f32(<8 x float> %data, <8 x float> poison, 
<8 x i1> ) ret <8 x float> %1 } @@ -524,7 +524,7 @@ define <4 x i64> @test_compress_q_256(<4 x i64> %data) { ; CHECK-LABEL: test_compress_q_256: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> undef, <4 x i1> ) + %1 = call <4 x i64> @llvm.x86.avx512.mask.compress.v4i64(<4 x i64> %data, <4 x i64> poison, <4 x i1> ) ret <4 x i64> %1 } @@ -570,7 +570,7 @@ define <8 x i32> @test_compress_d_256(<8 x i32> %data) { ; CHECK-LABEL: test_compress_d_256: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> undef, <8 x i1> ) + %1 = call <8 x i32> @llvm.x86.avx512.mask.compress.v8i32(<8 x i32> %data, <8 x i32> poison, <8 x i1> ) ret <8 x i32> %1 } @@ -578,7 +578,7 @@ define <4 x double> @test_expand_pd_256(<4 x double> %data) { ; CHECK-LABEL: test_expand_pd_256: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> undef, <4 x i1> ) + %1 = call <4 x double> @llvm.x86.avx512.mask.expand.v4f64(<4 x double> %data, <4 x double> poison, <4 x i1> ) ret <4 x double> %1 } @@ -626,7 +626,7 @@ define <8 x float> @test_expand_ps_256(<8 x float> %data) { ; CHECK-LABEL: test_expand_ps_256: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> undef, <8 x i1> ) + %1 = call <8 x float> @llvm.x86.avx512.mask.expand.v8f32(<8 x float> %data, <8 x float> poison, <8 x i1> ) ret <8 x float> %1 } @@ -672,7 +672,7 @@ define <4 x i64> @test_expand_q_256(<4 x i64> %data) { ; CHECK-LABEL: test_expand_q_256: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <4 x i64> @llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> undef, <4 x i1> ) + %1 = call <4 x i64> 
@llvm.x86.avx512.mask.expand.v4i64(<4 x i64> %data, <4 x i64> poison, <4 x i1> ) ret <4 x i64> %1 } @@ -720,7 +720,7 @@ define <8 x i32> @test_expand_d_256(<8 x i32> %data) { ; CHECK-LABEL: test_expand_d_256: ; CHECK: # %bb.0: ; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] - %1 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> undef, <8 x i1> ) + %1 = call <8 x i32> @llvm.x86.avx512.mask.expand.v8i32(<8 x i32> %data, <8 x i32> poison, <8 x i1> ) ret <8 x i32> %1 } @@ -884,7 +884,7 @@ define <4 x float> @test_mm512_maskz_max_ps_128(<4 x float> %a0, <4 x float> %a1 ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> + %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer ret <4 x float> %3 } @@ -906,7 +906,7 @@ define <4 x float> @test_mm512_mask_max_ps_128(<4 x float> %a0, <4 x float> %a1, ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x float> @llvm.x86.sse.max.ps(<4 x float> %a0, <4 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> + %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src ret <4 x float> %3 } @@ -986,7 +986,7 @@ define <4 x float> @test_mm512_maskz_min_ps_128(<4 x float> %a0, <4 x float> %a1 ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> + %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> zeroinitializer ret <4 x float> %3 } @@ -1008,7 +1008,7 @@ define <4 x float> @test_mm512_mask_min_ps_128(<4 x float> 
%a0, <4 x float> %a1, ; X64-NEXT: retq # encoding: [0xc3] %1 = call <4 x float> @llvm.x86.sse.min.ps(<4 x float> %a0, <4 x float> %a1) %2 = bitcast i8 %mask to <8 x i1> - %extract = shufflevector <8 x i1> %2, <8 x i1> undef, <4 x i32> + %extract = shufflevector <8 x i1> %2, <8 x i1> poison, <4 x i32> %3 = select <4 x i1> %extract, <4 x float> %1, <4 x float> %src ret <4 x float> %3 } @@ -5223,7 +5223,7 @@ define <2 x i64> @test_x86_avx512_mask_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru ret <2 x i64> %res2 } @@ -5242,7 +5242,7 @@ define <2 x i64> @test_x86_avx512_maskz_psra_q_128(<2 x i64> %a0, <2 x i64> %a1, ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.psra.q.128(<2 x i64> %a0, <2 x i64> %a1) ; <<2 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer ret <2 x i64> %res2 } @@ -5274,7 +5274,7 @@ define <4 x i64> @test_x86_avx512_mask_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru ret <4 
x i64> %res2 } @@ -5293,7 +5293,7 @@ define <4 x i64> @test_x86_avx512_maskz_psra_q_256(<4 x i64> %a0, <2 x i64> %a1, ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.psra.q.256(<4 x i64> %a0, <2 x i64> %a1) ; <<4 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer ret <4 x i64> %res2 } @@ -5325,7 +5325,7 @@ define <2 x i64> @test_x86_avx512_mask_psrai_q_128(<2 x i64> %a0, <2 x i64> %pas ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %passthru ret <2 x i64> %res2 } @@ -5344,7 +5344,7 @@ define <2 x i64> @test_x86_avx512_maskz_psrai_q_128(<2 x i64> %a0, i8 %mask) { ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.psrai.q.128(<2 x i64> %a0, i32 7) ; <<2 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer ret <2 x i64> %res2 } @@ -5376,7 +5376,7 @@ define <4 x i64> @test_x86_avx512_mask_psrai_q_256(<4 x i64> %a0, <4 x i64> %pas ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + 
%mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %passthru ret <4 x i64> %res2 } @@ -5395,7 +5395,7 @@ define <4 x i64> @test_x86_avx512_maskz_psrai_q_256(<4 x i64> %a0, i8 %mask) { ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.psrai.q.256(<4 x i64> %a0, i32 7) ; <<4 x i64>> [#uses=1] %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer ret <4 x i64> %res2 } @@ -5427,7 +5427,7 @@ define <2 x i64> @test_x86_avx512_mask_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1, ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1) %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> %a2 ret <2 x i64> %res2 } @@ -5447,7 +5447,7 @@ define <2 x i64> @test_x86_avx512_maskz_psrav_q_128(<2 x i64> %a0, <2 x i64> %a1 ; X64-NEXT: retq # encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.psrav.q.128(<2 x i64> %a0, <2 x i64> %a1) %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <2 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <2 x i32> %res2 = select <2 x i1> %mask.extract, <2 x i64> %res, <2 x i64> zeroinitializer ret <2 x i64> %res2 } @@ -5480,7 +5480,7 @@ define <4 x i64> @test_x86_avx512_mask_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1, ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1) %mask.cast = bitcast i8 %mask 
to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> %a2 ret <4 x i64> %res2 } @@ -5500,7 +5500,7 @@ define <4 x i64> @test_x86_avx512_maskz_psrav_q_256(<4 x i64> %a0, <4 x i64> %a1 ; X64-NEXT: retq # encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.psrav.q.256(<4 x i64> %a0, <4 x i64> %a1) %mask.cast = bitcast i8 %mask to <8 x i1> - %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> undef, <4 x i32> + %mask.extract = shufflevector <8 x i1> %mask.cast, <8 x i1> poison, <4 x i32> %res2 = select <4 x i1> %mask.extract, <4 x i64> %res, <4 x i64> zeroinitializer ret <4 x i64> %res2 } @@ -6861,7 +6861,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmb(<4 x float> %a0, <4 x float> %a1, ; X64-NEXT: # xmm0 {%k1} = (xmm1 * xmm0) + mem ; X64-NEXT: retq # encoding: [0xc3] %q = load float, ptr %ptr_a2 - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit.i = insertelement <4 x float> poison, float %q, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 @@ -6889,7 +6889,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmba(<4 x float> %a0, <4 x float> %a1 ; X64-NEXT: # xmm0 {%k1} = (xmm1 * xmm0) + mem ; X64-NEXT: retq # encoding: [0xc3] %q = load float, ptr %ptr_a2, align 4 - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit.i = insertelement <4 x float> poison, float %q, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 @@ -6914,7 +6914,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbz(<4 x float> %a0, <4 x float> %a1 ; 
X64-NEXT: # xmm0 = (xmm1 * xmm0) + mem ; X64-NEXT: retq # encoding: [0xc3] %q = load float, ptr %ptr_a2 - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit.i = insertelement <4 x float> poison, float %q, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 @@ -6936,7 +6936,7 @@ define <4 x float> @test_mask_vfmadd128_ps_rmbza(<4 x float> %a0, <4 x float> %a ; X64-NEXT: # xmm0 = (xmm1 * xmm0) + mem ; X64-NEXT: retq # encoding: [0xc3] %q = load float, ptr %ptr_a2, align 4 - %vecinit.i = insertelement <4 x float> undef, float %q, i32 0 + %vecinit.i = insertelement <4 x float> poison, float %q, i32 0 %vecinit2.i = insertelement <4 x float> %vecinit.i, float %q, i32 1 %vecinit4.i = insertelement <4 x float> %vecinit2.i, float %q, i32 2 %vecinit6.i = insertelement <4 x float> %vecinit4.i, float %q, i32 3 From 28a4ed945dc101c9a7dbdc93d9461da67225f7dc Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Tue, 10 Jun 2025 22:49:09 -0400 Subject: [PATCH 025/851] [AMDGPU][True16] remove AsmVOP3OpSel (#143465) This is NFC. Clean up the AsmVOP3OpSel field, and use Vop3Base instead. 
--- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 44 +--------------------- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 10 +---- llvm/lib/Target/AMDGPU/VOPInstructions.td | 6 +-- 3 files changed, 5 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 2c20475726a48..e74ccbee975ab 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2242,41 +2242,6 @@ class getAsmVOP3P { - string dst = "$vdst"; - - string isrc0 = !if(!eq(NumSrcArgs, 1), "$src0", "$src0,"); - string isrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1", - " $src1,")); - string isrc2 = !if(!eq(NumSrcArgs, 3), " $src2", ""); - - string fsrc0 = !if(!eq(NumSrcArgs, 1), "$src0_modifiers", "$src0_modifiers,"); - string fsrc1 = !if(!eq(NumSrcArgs, 1), "", - !if(!eq(NumSrcArgs, 2), " $src1_modifiers", - " $src1_modifiers,")); - string fsrc2 = !if(!eq(NumSrcArgs, 3), " $src2_modifiers", ""); - - string src0 = !if(Src0HasMods, fsrc0, isrc0); - string src1 = !if(Src1HasMods, fsrc1, isrc1); - string src2 = !if(Src2HasMods, fsrc2, isrc2); - - string bytesel = !if(HasByteSel, "$byte_sel", ""); - string clamp = !if(HasClamp, "$clamp", ""); - string omod = !if(HasOMod, "$omod", ""); - string bitop3 = !if(HasBitOp3, "$bitop3", ""); - string ret = dst#", "#src0#src1#src2#bitop3#"$op_sel"#bytesel#clamp#omod; -} - class getAsmDPP { string dst = !if(HasDst, !if(!eq(DstVT.Size, 1), @@ -2687,14 +2652,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { HasSrc2Mods, DstVT, HasFP8ByteSel, HasBitOp3>.ret; field string Asm64 = AsmVOP3Base; field string AsmVOP3P = getAsmVOP3P.ret; - field string AsmVOP3OpSel = getAsmVOP3OpSel.ret; + field string AsmVOP3OpSel = AsmVOP3Base; field string AsmVOP3DPP = getAsmVOP3DPP.ret; field string AsmVOP3DPP16 = getAsmVOP3DPP16.ret; field string AsmVOP3DPP8 = getAsmVOP3DPP8.ret; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td 
b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 9f66951372d19..a005e0245b8ff 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -626,10 +626,6 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, let HasOpSel = 1; let HasFP8DstByteSel = 1; let HasFP8ByteSel = 0; // It works as a dst-bytesel, but does not have byte_sel operand. - let AsmVOP3OpSel = !subst(", $src2_modifiers", "", - getAsmVOP3OpSel<3, HasClamp, HasOMod, - HasSrc0FloatMods, HasSrc1FloatMods, - HasSrc2FloatMods>.ret); let AsmVOP3Base = !subst(", $src2_modifiers", "", getAsmVOP3Base : VOP3_Profile< let HasSrc2 = 0; let HasSrc2Mods = 1; let HasOpSel = 1; - let AsmVOP3OpSel = !subst(", $src2_modifiers", "", - getAsmVOP3OpSel<3, HasClamp, HasOMod, - HasSrc0FloatMods, HasSrc1FloatMods, - HasSrc2FloatMods>.ret); + let Asm64 = !subst(", $src2_modifiers", "", AsmVOP3Base); let HasExtVOP3DPP = 0; let HasFP8DstByteSel = 1; + let HasFP8ByteSel = 0; } class VOP3_CVT_SCALE_SR_PK_F4_F16BF16_TiedInput_Profile : diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 4cd845aaa5497..6045f59d1f040 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -112,9 +112,7 @@ class VOP3_Pseudo pattern = [], bit HasFP8DstByteSel = P.HasFP8DstByteSel; bit HasFP4DstByteSel = P.HasFP4DstByteSel; - let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel), - P.AsmVOP3OpSel, - !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64)); + let AsmOperands = !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64); let Size = 8; let mayLoad = 0; @@ -1484,7 +1482,7 @@ class VOP3_Profile_Base : VO let HasModifiers = !if (Features.IsMAI, 0, - !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers)); + !or(Features.IsPacked, P.HasModifiers)); } class VOP3_Profile : VOP3_Profile_Base { From d75e28477af0baa063a4d4cc7b3cf657cfadd758 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 10 Jun 
2025 20:36:52 -0700 Subject: [PATCH 026/851] [flang][runtime] Fix build bot flang-runtime-cuda-gcc errors (#143650) Adjust default parent class accessibility to attempt to work around what appears to be old GCC's interpretation. --- flang-rt/include/flang-rt/runtime/work-queue.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h index 878b18373e1d2..f7f4777839836 100644 --- a/flang-rt/include/flang-rt/runtime/work-queue.h +++ b/flang-rt/include/flang-rt/runtime/work-queue.h @@ -319,7 +319,7 @@ class AssignTicket : public ImmediateTicketRunner { template class DerivedAssignTicket : public ImmediateTicketRunner>, - private std::conditional_t { public: using Base = std::conditional_t class DescriptorIoTicket : public ImmediateTicketRunner>, - private Elementwise { + protected Elementwise { public: RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io, const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table, @@ -372,7 +372,7 @@ class DescriptorIoTicket template class DerivedIoTicket : public ImmediateTicketRunner>, - private ElementsOverComponents { + protected ElementsOverComponents { public: RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io, const Descriptor &descriptor, const typeInfo::DerivedType &derived, From 3ece9b06a2d299d5a108efa856e662587543b2f3 Mon Sep 17 00:00:00 2001 From: quic_hchandel Date: Wed, 11 Jun 2025 09:56:12 +0530 Subject: [PATCH 027/851] [RISCV][NFC] Improve test coverage for xtheadcondmov and xmipscmov (#143567) Co-authored-by: Harsh Chandel --- llvm/test/CodeGen/RISCV/select-cond.ll | 1018 ++++++++++++++++++++++++ 1 file changed, 1018 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/select-cond.ll diff --git a/llvm/test/CodeGen/RISCV/select-cond.ll b/llvm/test/CodeGen/RISCV/select-cond.ll new file mode 100644 index 0000000000000..a5f4677f73f13 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/select-cond.ll @@
-0,0 +1,1018 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32 +; RUN: llc -mtriple=riscv32 -mattr=+xtheadcondmov -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV32-THEAD +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV64 +; RUN: llc -mtriple=riscv64 -mattr=+xmipscmov -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=RV64-MIPS + +define signext i32 @select_i32_trunc(i32 signext %cond, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_trunc: +; RV32: # %bb.0: +; RV32-NEXT: andi a3, a0, 1 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: bnez a3, .LBB0_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: .LBB0_2: +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_trunc: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: andi a0, a0, 1 +; RV32-THEAD-NEXT: th.mveqz a1, a2, a0 +; RV32-THEAD-NEXT: mv a0, a1 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_trunc: +; RV64: # %bb.0: +; RV64-NEXT: andi a3, a0, 1 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: bnez a3, .LBB0_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: .LBB0_2: +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_trunc: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: andi a0, a0, 1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a1, a2 +; RV64-MIPS-NEXT: ret + %cond_trunc = trunc i32 %cond to i1 + %res = select i1 %cond_trunc, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_param(i1 signext %cond, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_param: +; RV32: # %bb.0: +; RV32-NEXT: andi a3, a0, 1 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: bnez a3, .LBB1_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_param: +; RV32-THEAD: # %bb.0: +; 
RV32-THEAD-NEXT: andi a0, a0, 1 +; RV32-THEAD-NEXT: th.mveqz a1, a2, a0 +; RV32-THEAD-NEXT: mv a0, a1 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_param: +; RV64: # %bb.0: +; RV64-NEXT: andi a3, a0, 1 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: bnez a3, .LBB1_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: .LBB1_2: +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_param: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: andi a0, a0, 1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a1, a2 +; RV64-MIPS-NEXT: ret + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_eq(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_eq: +; RV32: # %bb.0: +; RV32-NEXT: beq a0, a1, .LBB2_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB2_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_eq: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor a0, a0, a1 +; RV32-THEAD-NEXT: th.mvnez a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_eq: +; RV64: # %bb.0: +; RV64-NEXT: beq a0, a1, .LBB2_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB2_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_eq: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: xor a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 +; RV64-MIPS-NEXT: ret + %cond = icmp eq i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_ne(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_ne: +; RV32: # %bb.0: +; RV32-NEXT: bne a0, a1, .LBB3_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB3_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_ne: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor a0, a0, a1 +; RV32-THEAD-NEXT: th.mveqz a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; 
RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_ne: +; RV64: # %bb.0: +; RV64-NEXT: bne a0, a1, .LBB3_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB3_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_ne: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: xor a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp ne i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_ugt(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_ugt: +; RV32: # %bb.0: +; RV32-NEXT: bltu a1, a0, .LBB4_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB4_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_ugt: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: sltu a0, a1, a0 +; RV32-THEAD-NEXT: th.mveqz a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_ugt: +; RV64: # %bb.0: +; RV64-NEXT: bltu a1, a0, .LBB4_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB4_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_ugt: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: sltu a0, a1, a0 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp ugt i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_uge(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_uge: +; RV32: # %bb.0: +; RV32-NEXT: bgeu a0, a1, .LBB5_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB5_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_uge: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: sltu a0, a0, a1 +; RV32-THEAD-NEXT: th.mvnez a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_uge: +; RV64: # %bb.0: +; RV64-NEXT: bgeu 
a0, a1, .LBB5_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB5_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_uge: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: sltu a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 +; RV64-MIPS-NEXT: ret + %cond = icmp uge i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_ult(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_ult: +; RV32: # %bb.0: +; RV32-NEXT: bltu a0, a1, .LBB6_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB6_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_ult: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: sltu a0, a0, a1 +; RV32-THEAD-NEXT: th.mveqz a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_ult: +; RV64: # %bb.0: +; RV64-NEXT: bltu a0, a1, .LBB6_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB6_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_ult: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: sltu a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp ult i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_ule(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_ule: +; RV32: # %bb.0: +; RV32-NEXT: bgeu a1, a0, .LBB7_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB7_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_ule: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: sltu a0, a1, a0 +; RV32-THEAD-NEXT: th.mvnez a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_ule: +; RV64: # %bb.0: +; RV64-NEXT: bgeu a1, a0, .LBB7_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB7_2: 
+; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_ule: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: sltu a0, a1, a0 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 +; RV64-MIPS-NEXT: ret + %cond = icmp ule i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_sgt(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_sgt: +; RV32: # %bb.0: +; RV32-NEXT: blt a1, a0, .LBB8_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB8_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_sgt: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: slt a0, a1, a0 +; RV32-THEAD-NEXT: th.mveqz a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_sgt: +; RV64: # %bb.0: +; RV64-NEXT: blt a1, a0, .LBB8_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB8_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_sgt: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: slt a0, a1, a0 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp sgt i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_sge(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_sge: +; RV32: # %bb.0: +; RV32-NEXT: bge a0, a1, .LBB9_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB9_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_sge: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: slt a0, a0, a1 +; RV32-THEAD-NEXT: th.mvnez a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_sge: +; RV64: # %bb.0: +; RV64-NEXT: bge a0, a1, .LBB9_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB9_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_sge: +; RV64-MIPS: 
# %bb.0: +; RV64-MIPS-NEXT: slt a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 +; RV64-MIPS-NEXT: ret + %cond = icmp sge i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_slt(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_slt: +; RV32: # %bb.0: +; RV32-NEXT: blt a0, a1, .LBB10_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB10_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_slt: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: slt a0, a0, a1 +; RV32-THEAD-NEXT: th.mveqz a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_slt: +; RV64: # %bb.0: +; RV64-NEXT: blt a0, a1, .LBB10_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB10_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_slt: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: slt a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp slt i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define signext i32 @select_i32_sle(i32 signext %a, i32 signext %b, i32 signext %x, i32 signext %y) nounwind { +; RV32-LABEL: select_i32_sle: +; RV32: # %bb.0: +; RV32-NEXT: bge a1, a0, .LBB11_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a2, a3 +; RV32-NEXT: .LBB11_2: +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i32_sle: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: slt a0, a1, a0 +; RV32-THEAD-NEXT: th.mvnez a2, a3, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i32_sle: +; RV64: # %bb.0: +; RV64-NEXT: bge a1, a0, .LBB11_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB11_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i32_sle: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: slt a0, a1, a0 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 
+; RV64-MIPS-NEXT: ret + %cond = icmp sle i32 %a, %b + %res = select i1 %cond, i32 %x, i32 %y + ret i32 %res +} + +define i64 @select_i64_trunc(i64 %cond, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_trunc: +; RV32: # %bb.0: +; RV32-NEXT: mv a1, a3 +; RV32-NEXT: andi a3, a0, 1 +; RV32-NEXT: mv a0, a2 +; RV32-NEXT: bnez a3, .LBB12_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: .LBB12_2: +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_trunc: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: mv a1, a3 +; RV32-THEAD-NEXT: andi a0, a0, 1 +; RV32-THEAD-NEXT: th.mveqz a2, a4, a0 +; RV32-THEAD-NEXT: th.mveqz a1, a5, a0 +; RV32-THEAD-NEXT: mv a0, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_trunc: +; RV64: # %bb.0: +; RV64-NEXT: andi a3, a0, 1 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: bnez a3, .LBB12_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: .LBB12_2: +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_trunc: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: andi a0, a0, 1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a1, a2 +; RV64-MIPS-NEXT: ret + %cond_trunc = trunc i64 %cond to i1 + %res = select i1 %cond_trunc, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_param(i1 %cond, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_param: +; RV32: # %bb.0: +; RV32-NEXT: andi a5, a0, 1 +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: bnez a5, .LBB13_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a3 +; RV32-NEXT: mv a2, a4 +; RV32-NEXT: .LBB13_2: +; RV32-NEXT: mv a1, a2 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_param: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: andi a0, a0, 1 +; RV32-THEAD-NEXT: th.mveqz a1, a3, a0 +; RV32-THEAD-NEXT: th.mveqz a2, a4, a0 +; RV32-THEAD-NEXT: mv a0, a1 +; RV32-THEAD-NEXT: mv a1, a2 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_param: +; RV64: # %bb.0: +; RV64-NEXT: andi a3, a0, 1 +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: bnez a3, .LBB13_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: 
mv a0, a2 +; RV64-NEXT: .LBB13_2: +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_param: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: andi a0, a0, 1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a1, a2 +; RV64-MIPS-NEXT: ret + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_eq(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_eq: +; RV32: # %bb.0: +; RV32-NEXT: xor a1, a1, a3 +; RV32-NEXT: xor a0, a0, a2 +; RV32-NEXT: or a1, a0, a1 +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: beqz a1, .LBB14_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB14_2: +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_eq: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor a1, a1, a3 +; RV32-THEAD-NEXT: xor a0, a0, a2 +; RV32-THEAD-NEXT: or a0, a0, a1 +; RV32-THEAD-NEXT: th.mvnez a4, a6, a0 +; RV32-THEAD-NEXT: th.mvnez a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_eq: +; RV64: # %bb.0: +; RV64-NEXT: beq a0, a1, .LBB14_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB14_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_eq: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: xor a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 +; RV64-MIPS-NEXT: ret + %cond = icmp eq i64 %a, %b + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_ne(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_ne: +; RV32: # %bb.0: +; RV32-NEXT: xor a1, a1, a3 +; RV32-NEXT: xor a0, a0, a2 +; RV32-NEXT: or a1, a0, a1 +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: bnez a1, .LBB15_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB15_2: +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_ne: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor a1, a1, a3 +; RV32-THEAD-NEXT: xor a0, a0, a2 +; 
RV32-THEAD-NEXT: or a0, a0, a1 +; RV32-THEAD-NEXT: th.mveqz a4, a6, a0 +; RV32-THEAD-NEXT: th.mveqz a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_ne: +; RV64: # %bb.0: +; RV64-NEXT: bne a0, a1, .LBB15_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB15_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_ne: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: xor a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp ne i64 %a, %b + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_ugt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_ugt: +; RV32: # %bb.0: +; RV32-NEXT: beq a1, a3, .LBB16_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: beqz a0, .LBB16_3 +; RV32-NEXT: j .LBB16_4 +; RV32-NEXT: .LBB16_2: +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: bnez a0, .LBB16_4 +; RV32-NEXT: .LBB16_3: +; RV32-NEXT: mv a4, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB16_4: +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_ugt: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor t0, a1, a3 +; RV32-THEAD-NEXT: sltu a1, a3, a1 +; RV32-THEAD-NEXT: sltu a0, a2, a0 +; RV32-THEAD-NEXT: th.mvnez a0, a1, t0 +; RV32-THEAD-NEXT: th.mveqz a4, a6, a0 +; RV32-THEAD-NEXT: th.mveqz a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_ugt: +; RV64: # %bb.0: +; RV64-NEXT: bltu a1, a0, .LBB16_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB16_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_ugt: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: sltu a0, a1, a0 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp ugt i64 %a, %b + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +define 
i64 @select_i64_uge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_uge: +; RV32: # %bb.0: +; RV32-NEXT: beq a1, a3, .LBB17_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: sltu a0, a1, a3 +; RV32-NEXT: bnez a0, .LBB17_3 +; RV32-NEXT: j .LBB17_4 +; RV32-NEXT: .LBB17_2: +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: beqz a0, .LBB17_4 +; RV32-NEXT: .LBB17_3: +; RV32-NEXT: mv a4, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB17_4: +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_uge: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor t0, a1, a3 +; RV32-THEAD-NEXT: sltu a1, a1, a3 +; RV32-THEAD-NEXT: sltu a0, a0, a2 +; RV32-THEAD-NEXT: th.mvnez a0, a1, t0 +; RV32-THEAD-NEXT: th.mvnez a4, a6, a0 +; RV32-THEAD-NEXT: th.mvnez a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_uge: +; RV64: # %bb.0: +; RV64-NEXT: bgeu a0, a1, .LBB17_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB17_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_uge: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: sltu a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 +; RV64-MIPS-NEXT: ret + %cond = icmp uge i64 %a, %b + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_ult(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_ult: +; RV32: # %bb.0: +; RV32-NEXT: beq a1, a3, .LBB18_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: sltu a0, a1, a3 +; RV32-NEXT: beqz a0, .LBB18_3 +; RV32-NEXT: j .LBB18_4 +; RV32-NEXT: .LBB18_2: +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: bnez a0, .LBB18_4 +; RV32-NEXT: .LBB18_3: +; RV32-NEXT: mv a4, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB18_4: +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_ult: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor t0, a1, a3 +; RV32-THEAD-NEXT: sltu a1, a1, a3 +; 
RV32-THEAD-NEXT: sltu a0, a0, a2 +; RV32-THEAD-NEXT: th.mvnez a0, a1, t0 +; RV32-THEAD-NEXT: th.mveqz a4, a6, a0 +; RV32-THEAD-NEXT: th.mveqz a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_ult: +; RV64: # %bb.0: +; RV64-NEXT: bltu a0, a1, .LBB18_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB18_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_ult: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: sltu a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp ult i64 %a, %b + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_ule(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_ule: +; RV32: # %bb.0: +; RV32-NEXT: beq a1, a3, .LBB19_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: sltu a0, a3, a1 +; RV32-NEXT: bnez a0, .LBB19_3 +; RV32-NEXT: j .LBB19_4 +; RV32-NEXT: .LBB19_2: +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: beqz a0, .LBB19_4 +; RV32-NEXT: .LBB19_3: +; RV32-NEXT: mv a4, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB19_4: +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_ule: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor t0, a1, a3 +; RV32-THEAD-NEXT: sltu a1, a3, a1 +; RV32-THEAD-NEXT: sltu a0, a2, a0 +; RV32-THEAD-NEXT: th.mvnez a0, a1, t0 +; RV32-THEAD-NEXT: th.mvnez a4, a6, a0 +; RV32-THEAD-NEXT: th.mvnez a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_ule: +; RV64: # %bb.0: +; RV64-NEXT: bgeu a1, a0, .LBB19_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB19_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_ule: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: sltu a0, a1, a0 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 +; RV64-MIPS-NEXT: ret + %cond = icmp ule i64 %a, %b + %res = select i1 
%cond, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_sgt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_sgt: +; RV32: # %bb.0: +; RV32-NEXT: beq a1, a3, .LBB20_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: slt a0, a3, a1 +; RV32-NEXT: beqz a0, .LBB20_3 +; RV32-NEXT: j .LBB20_4 +; RV32-NEXT: .LBB20_2: +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: bnez a0, .LBB20_4 +; RV32-NEXT: .LBB20_3: +; RV32-NEXT: mv a4, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB20_4: +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_sgt: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor t0, a1, a3 +; RV32-THEAD-NEXT: slt a1, a3, a1 +; RV32-THEAD-NEXT: sltu a0, a2, a0 +; RV32-THEAD-NEXT: th.mvnez a0, a1, t0 +; RV32-THEAD-NEXT: th.mveqz a4, a6, a0 +; RV32-THEAD-NEXT: th.mveqz a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_sgt: +; RV64: # %bb.0: +; RV64-NEXT: blt a1, a0, .LBB20_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB20_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_sgt: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: slt a0, a1, a0 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp sgt i64 %a, %b + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_sge(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_sge: +; RV32: # %bb.0: +; RV32-NEXT: beq a1, a3, .LBB21_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: slt a0, a1, a3 +; RV32-NEXT: bnez a0, .LBB21_3 +; RV32-NEXT: j .LBB21_4 +; RV32-NEXT: .LBB21_2: +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: beqz a0, .LBB21_4 +; RV32-NEXT: .LBB21_3: +; RV32-NEXT: mv a4, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB21_4: +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_sge: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor t0, a1, a3 +; 
RV32-THEAD-NEXT: slt a1, a1, a3 +; RV32-THEAD-NEXT: sltu a0, a0, a2 +; RV32-THEAD-NEXT: th.mvnez a0, a1, t0 +; RV32-THEAD-NEXT: th.mvnez a4, a6, a0 +; RV32-THEAD-NEXT: th.mvnez a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_sge: +; RV64: # %bb.0: +; RV64-NEXT: bge a0, a1, .LBB21_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB21_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_sge: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: slt a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 +; RV64-MIPS-NEXT: ret + %cond = icmp sge i64 %a, %b + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_slt(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_slt: +; RV32: # %bb.0: +; RV32-NEXT: beq a1, a3, .LBB22_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: slt a0, a1, a3 +; RV32-NEXT: beqz a0, .LBB22_3 +; RV32-NEXT: j .LBB22_4 +; RV32-NEXT: .LBB22_2: +; RV32-NEXT: sltu a0, a0, a2 +; RV32-NEXT: bnez a0, .LBB22_4 +; RV32-NEXT: .LBB22_3: +; RV32-NEXT: mv a4, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB22_4: +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_slt: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor t0, a1, a3 +; RV32-THEAD-NEXT: slt a1, a1, a3 +; RV32-THEAD-NEXT: sltu a0, a0, a2 +; RV32-THEAD-NEXT: th.mvnez a0, a1, t0 +; RV32-THEAD-NEXT: th.mveqz a4, a6, a0 +; RV32-THEAD-NEXT: th.mveqz a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_slt: +; RV64: # %bb.0: +; RV64-NEXT: blt a0, a1, .LBB22_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB22_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_slt: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: slt a0, a0, a1 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a2, a3 +; RV64-MIPS-NEXT: ret + %cond = icmp slt i64 
%a, %b + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + +define i64 @select_i64_sle(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { +; RV32-LABEL: select_i64_sle: +; RV32: # %bb.0: +; RV32-NEXT: beq a1, a3, .LBB23_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: slt a0, a3, a1 +; RV32-NEXT: bnez a0, .LBB23_3 +; RV32-NEXT: j .LBB23_4 +; RV32-NEXT: .LBB23_2: +; RV32-NEXT: sltu a0, a2, a0 +; RV32-NEXT: beqz a0, .LBB23_4 +; RV32-NEXT: .LBB23_3: +; RV32-NEXT: mv a4, a6 +; RV32-NEXT: mv a5, a7 +; RV32-NEXT: .LBB23_4: +; RV32-NEXT: mv a0, a4 +; RV32-NEXT: mv a1, a5 +; RV32-NEXT: ret +; +; RV32-THEAD-LABEL: select_i64_sle: +; RV32-THEAD: # %bb.0: +; RV32-THEAD-NEXT: xor t0, a1, a3 +; RV32-THEAD-NEXT: slt a1, a3, a1 +; RV32-THEAD-NEXT: sltu a0, a2, a0 +; RV32-THEAD-NEXT: th.mvnez a0, a1, t0 +; RV32-THEAD-NEXT: th.mvnez a4, a6, a0 +; RV32-THEAD-NEXT: th.mvnez a5, a7, a0 +; RV32-THEAD-NEXT: mv a0, a4 +; RV32-THEAD-NEXT: mv a1, a5 +; RV32-THEAD-NEXT: ret +; +; RV64-LABEL: select_i64_sle: +; RV64: # %bb.0: +; RV64-NEXT: bge a1, a0, .LBB23_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a2, a3 +; RV64-NEXT: .LBB23_2: +; RV64-NEXT: mv a0, a2 +; RV64-NEXT: ret +; +; RV64-MIPS-LABEL: select_i64_sle: +; RV64-MIPS: # %bb.0: +; RV64-MIPS-NEXT: slt a0, a1, a0 +; RV64-MIPS-NEXT: mips.ccmov a0, a0, a3, a2 +; RV64-MIPS-NEXT: ret + %cond = icmp sle i64 %a, %b + %res = select i1 %cond, i64 %x, i64 %y + ret i64 %res +} + From a3201ce9e114aa2ecd66e525607093e4dff2f574 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 10 Jun 2025 22:10:26 -0700 Subject: [PATCH 028/851] [flang][cuda] Add option to disable warp function in semantic (#143640) These functions are not available in some lower compute capabilities. Add option in the language feature to enforce the semantic check on these. 
--- .../include/flang/Support/Fortran-features.h | 2 +- flang/lib/Semantics/check-cuda.cpp | 125 ++++++++++++------ flang/test/Semantics/cuf22.cuf | 8 ++ flang/tools/bbc/bbc.cpp | 10 ++ 4 files changed, 101 insertions(+), 44 deletions(-) create mode 100644 flang/test/Semantics/cuf22.cuf diff --git a/flang/include/flang/Support/Fortran-features.h b/flang/include/flang/Support/Fortran-features.h index 3f6d825e2b66c..ea0845b7d605f 100644 --- a/flang/include/flang/Support/Fortran-features.h +++ b/flang/include/flang/Support/Fortran-features.h @@ -55,7 +55,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, SavedLocalInSpecExpr, PrintNamelist, AssumedRankPassedToNonAssumedRank, IgnoreIrrelevantAttributes, Unsigned, AmbiguousStructureConstructor, ContiguousOkForSeqAssociation, ForwardRefExplicitTypeDummy, - InaccessibleDeferredOverride) + InaccessibleDeferredOverride, CudaWarpMatchFunction) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, diff --git a/flang/lib/Semantics/check-cuda.cpp b/flang/lib/Semantics/check-cuda.cpp index c024640af1220..8decfb0149829 100644 --- a/flang/lib/Semantics/check-cuda.cpp +++ b/flang/lib/Semantics/check-cuda.cpp @@ -17,6 +17,7 @@ #include "flang/Semantics/expression.h" #include "flang/Semantics/symbol.h" #include "flang/Semantics/tools.h" +#include "llvm/ADT/StringSet.h" // Once labeled DO constructs have been canonicalized and their parse subtrees // transformed into parser::DoConstructs, scan the parser::Blocks of the program @@ -61,6 +62,11 @@ bool CanonicalizeCUDA(parser::Program &program) { using MaybeMsg = std::optional; +static const llvm::StringSet<> warpFunctions_ = {"match_all_syncjj", + "match_all_syncjx", "match_all_syncjf", "match_all_syncjd", + "match_any_syncjj", "match_any_syncjx", "match_any_syncjf", + "match_any_syncjd"}; + // Traverses an evaluate::Expr<> in search of unsupported operations // on the device. 
@@ -68,7 +74,7 @@ struct DeviceExprChecker : public evaluate::AnyTraverse { using Result = MaybeMsg; using Base = evaluate::AnyTraverse; - DeviceExprChecker() : Base(*this) {} + explicit DeviceExprChecker(SemanticsContext &c) : Base(*this), context_{c} {} using Base::operator(); Result operator()(const evaluate::ProcedureDesignator &x) const { if (const Symbol * sym{x.GetInterfaceSymbol()}) { @@ -78,10 +84,17 @@ struct DeviceExprChecker if (auto attrs{subp->cudaSubprogramAttrs()}) { if (*attrs == common::CUDASubprogramAttrs::HostDevice || *attrs == common::CUDASubprogramAttrs::Device) { + if (warpFunctions_.contains(sym->name().ToString()) && + !context_.languageFeatures().IsEnabled( + Fortran::common::LanguageFeature::CudaWarpMatchFunction)) { + return parser::MessageFormattedText( + "warp match function disabled"_err_en_US); + } return {}; } } } + const Symbol &ultimate{sym->GetUltimate()}; const Scope &scope{ultimate.owner()}; const Symbol *mod{scope.IsModule() ? scope.symbol() : nullptr}; @@ -94,9 +107,12 @@ struct DeviceExprChecker // TODO(CUDA): Check for unsupported intrinsics here return {}; } + return parser::MessageFormattedText( "'%s' may not be called in device code"_err_en_US, x.GetName()); } + + SemanticsContext &context_; }; struct FindHostArray @@ -133,9 +149,10 @@ struct FindHostArray } }; -template static MaybeMsg CheckUnwrappedExpr(const A &x) { +template +static MaybeMsg CheckUnwrappedExpr(SemanticsContext &context, const A &x) { if (const auto *expr{parser::Unwrap(x)}) { - return DeviceExprChecker{}(expr->typedExpr); + return DeviceExprChecker{context}(expr->typedExpr); } return {}; } @@ -144,104 +161,124 @@ template static void CheckUnwrappedExpr( SemanticsContext &context, SourceName at, const A &x) { if (const auto *expr{parser::Unwrap(x)}) { - if (auto msg{DeviceExprChecker{}(expr->typedExpr)}) { + if (auto msg{DeviceExprChecker{context}(expr->typedExpr)}) { context.Say(at, std::move(*msg)); } } } template struct ActionStmtChecker { - 
template static MaybeMsg WhyNotOk(const A &x) { + template + static MaybeMsg WhyNotOk(SemanticsContext &context, const A &x) { if constexpr (ConstraintTrait) { - return WhyNotOk(x.thing); + return WhyNotOk(context, x.thing); } else if constexpr (WrapperTrait) { - return WhyNotOk(x.v); + return WhyNotOk(context, x.v); } else if constexpr (UnionTrait) { - return WhyNotOk(x.u); + return WhyNotOk(context, x.u); } else if constexpr (TupleTrait) { - return WhyNotOk(x.t); + return WhyNotOk(context, x.t); } else { return parser::MessageFormattedText{ "Statement may not appear in device code"_err_en_US}; } } template - static MaybeMsg WhyNotOk(const common::Indirection &x) { - return WhyNotOk(x.value()); + static MaybeMsg WhyNotOk( + SemanticsContext &context, const common::Indirection &x) { + return WhyNotOk(context, x.value()); } template - static MaybeMsg WhyNotOk(const std::variant &x) { - return common::visit([](const auto &x) { return WhyNotOk(x); }, x); + static MaybeMsg WhyNotOk( + SemanticsContext &context, const std::variant &x) { + return common::visit( + [&context](const auto &x) { return WhyNotOk(context, x); }, x); } template - static MaybeMsg WhyNotOk(const std::tuple &x) { + static MaybeMsg WhyNotOk( + SemanticsContext &context, const std::tuple &x) { if constexpr (J == sizeof...(As)) { return {}; - } else if (auto msg{WhyNotOk(std::get(x))}) { + } else if (auto msg{WhyNotOk(context, std::get(x))}) { return msg; } else { - return WhyNotOk<(J + 1)>(x); + return WhyNotOk<(J + 1)>(context, x); } } - template static MaybeMsg WhyNotOk(const std::list &x) { + template + static MaybeMsg WhyNotOk(SemanticsContext &context, const std::list &x) { for (const auto &y : x) { - if (MaybeMsg result{WhyNotOk(y)}) { + if (MaybeMsg result{WhyNotOk(context, y)}) { return result; } } return {}; } - template static MaybeMsg WhyNotOk(const std::optional &x) { + template + static MaybeMsg WhyNotOk( + SemanticsContext &context, const std::optional &x) { if (x) { - return 
WhyNotOk(*x); + return WhyNotOk(context, *x); } else { return {}; } } template - static MaybeMsg WhyNotOk(const parser::UnlabeledStatement &x) { - return WhyNotOk(x.statement); + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::UnlabeledStatement &x) { + return WhyNotOk(context, x.statement); } template - static MaybeMsg WhyNotOk(const parser::Statement &x) { - return WhyNotOk(x.statement); + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::Statement &x) { + return WhyNotOk(context, x.statement); } - static MaybeMsg WhyNotOk(const parser::AllocateStmt &) { + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::AllocateStmt &) { return {}; // AllocateObjects are checked elsewhere } - static MaybeMsg WhyNotOk(const parser::AllocateCoarraySpec &) { + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::AllocateCoarraySpec &) { return parser::MessageFormattedText( "A coarray may not be allocated on the device"_err_en_US); } - static MaybeMsg WhyNotOk(const parser::DeallocateStmt &) { + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::DeallocateStmt &) { return {}; // AllocateObjects are checked elsewhere } - static MaybeMsg WhyNotOk(const parser::AssignmentStmt &x) { - return DeviceExprChecker{}(x.typedAssignment); + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::AssignmentStmt &x) { + return DeviceExprChecker{context}(x.typedAssignment); } - static MaybeMsg WhyNotOk(const parser::CallStmt &x) { - return DeviceExprChecker{}(x.typedCall); + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::CallStmt &x) { + return DeviceExprChecker{context}(x.typedCall); + } + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::ContinueStmt &) { + return {}; } - static MaybeMsg WhyNotOk(const parser::ContinueStmt &) { return {}; } - static MaybeMsg WhyNotOk(const parser::IfStmt &x) { - if (auto result{ - CheckUnwrappedExpr(std::get(x.t))}) 
{ + static MaybeMsg WhyNotOk(SemanticsContext &context, const parser::IfStmt &x) { + if (auto result{CheckUnwrappedExpr( + context, std::get(x.t))}) { return result; } - return WhyNotOk( + return WhyNotOk(context, std::get>(x.t) .statement); } - static MaybeMsg WhyNotOk(const parser::NullifyStmt &x) { + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::NullifyStmt &x) { for (const auto &y : x.v) { - if (MaybeMsg result{DeviceExprChecker{}(y.typedExpr)}) { + if (MaybeMsg result{DeviceExprChecker{context}(y.typedExpr)}) { return result; } } return {}; } - static MaybeMsg WhyNotOk(const parser::PointerAssignmentStmt &x) { - return DeviceExprChecker{}(x.typedAssignment); + static MaybeMsg WhyNotOk( + SemanticsContext &context, const parser::PointerAssignmentStmt &x) { + return DeviceExprChecker{context}(x.typedAssignment); } }; @@ -435,12 +472,14 @@ template class DeviceContextChecker { ErrorIfHostSymbol(assign->lhs, source); ErrorIfHostSymbol(assign->rhs, source); } - if (auto msg{ActionStmtChecker::WhyNotOk(x)}) { + if (auto msg{ActionStmtChecker::WhyNotOk( + context_, x)}) { context_.Say(source, std::move(*msg)); } }, [&](const auto &x) { - if (auto msg{ActionStmtChecker::WhyNotOk(x)}) { + if (auto msg{ActionStmtChecker::WhyNotOk( + context_, x)}) { context_.Say(source, std::move(*msg)); } }, @@ -504,7 +543,7 @@ template class DeviceContextChecker { Check(DEREF(parser::Unwrap(x))); } void Check(const parser::Expr &expr) { - if (MaybeMsg msg{DeviceExprChecker{}(expr.typedExpr)}) { + if (MaybeMsg msg{DeviceExprChecker{context_}(expr.typedExpr)}) { context_.Say(expr.source, std::move(*msg)); } } diff --git a/flang/test/Semantics/cuf22.cuf b/flang/test/Semantics/cuf22.cuf new file mode 100644 index 0000000000000..36e0f0b2502df --- /dev/null +++ b/flang/test/Semantics/cuf22.cuf @@ -0,0 +1,8 @@ +! 
RUN: not bbc -fcuda -fcuda-disable-warp-function %s -o - 2>&1 | FileCheck %s + +attributes(device) subroutine testMatch() + integer :: a, ipred, mask, v32 + a = match_all_sync(mask, v32, ipred) +end subroutine + +! CHECK: warp match function disabled diff --git a/flang/tools/bbc/bbc.cpp b/flang/tools/bbc/bbc.cpp index c544008a24d56..c80872108ac8f 100644 --- a/flang/tools/bbc/bbc.cpp +++ b/flang/tools/bbc/bbc.cpp @@ -223,6 +223,11 @@ static llvm::cl::opt enableCUDA("fcuda", llvm::cl::desc("enable CUDA Fortran"), llvm::cl::init(false)); +static llvm::cl::opt + disableCUDAWarpFunction("fcuda-disable-warp-function", + llvm::cl::desc("Disable CUDA Warp Function"), + llvm::cl::init(false)); + static llvm::cl::opt enableGPUMode("gpu", llvm::cl::desc("Enable GPU Mode managed|unified"), llvm::cl::init("")); @@ -600,6 +605,11 @@ int main(int argc, char **argv) { options.features.Enable(Fortran::common::LanguageFeature::CUDA); } + if (disableCUDAWarpFunction) { + options.features.Enable( + Fortran::common::LanguageFeature::CudaWarpMatchFunction, false); + } + if (enableGPUMode == "managed") { options.features.Enable(Fortran::common::LanguageFeature::CudaManaged); } else if (enableGPUMode == "unified") { From 842377882a3f52e345668751fa6d46ba4f7268d2 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Wed, 11 Jun 2025 13:32:49 +0800 Subject: [PATCH 029/851] [RISCV] Select signed bitfield insert for XAndesPerf (#143356) This patch is similar to #142737 The XAndesPerf extension includes signed bitfield extraction instruction `NDS.BFOS, which can extract the bits from 0 to Len - 1, place them starting at bit Lsb, zero-filled the bits from 0 to Lsb -1, and sign-extend the result. When Lsb == Msb, it is a special case where the Lsb will be set to 0 instead of being equal to the Msb. 
--- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 56 +++++++++++++++++++++ llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h | 1 + llvm/test/CodeGen/RISCV/rv32xandesperf.ll | 26 ++++++++++ llvm/test/CodeGen/RISCV/rv64xandesperf.ll | 46 +++++++++++++++++ 4 files changed, 129 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index d298965595b47..4539efd591c8b 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -683,6 +683,59 @@ bool RISCVDAGToDAGISel::trySignedBitfieldExtract(SDNode *Node) { return false; } +bool RISCVDAGToDAGISel::trySignedBitfieldInsertInSign(SDNode *Node) { + // Only supported with XAndesPerf at the moment. + if (!Subtarget->hasVendorXAndesPerf()) + return false; + + auto *N1C = dyn_cast(Node->getOperand(1)); + if (!N1C) + return false; + + SDValue N0 = Node->getOperand(0); + if (!N0.hasOneUse()) + return false; + + auto BitfieldInsert = [&](SDValue N0, unsigned Msb, unsigned Lsb, + const SDLoc &DL, MVT VT) { + unsigned Opc = RISCV::NDS_BFOS; + // If the Lsb is equal to the Msb, then the Lsb should be 0. + if (Lsb == Msb) + Lsb = 0; + return CurDAG->getMachineNode(Opc, DL, VT, N0.getOperand(0), + CurDAG->getTargetConstant(Lsb, DL, VT), + CurDAG->getTargetConstant(Msb, DL, VT)); + }; + + SDLoc DL(Node); + MVT VT = Node->getSimpleValueType(0); + const unsigned RightShAmt = N1C->getZExtValue(); + + // Transform (sra (shl X, C1) C2) with C1 > C2 + // -> (NDS.BFOS X, lsb, msb) + if (N0.getOpcode() == ISD::SHL) { + auto *N01C = dyn_cast(N0->getOperand(1)); + if (!N01C) + return false; + + const unsigned LeftShAmt = N01C->getZExtValue(); + // Make sure that this is a bitfield insertion (i.e., the shift-right + // amount should be less than the left-shift). 
+ if (LeftShAmt <= RightShAmt) + return false; + + const unsigned MsbPlusOne = VT.getSizeInBits() - RightShAmt; + const unsigned Msb = MsbPlusOne - 1; + const unsigned Lsb = LeftShAmt - RightShAmt; + + SDNode *Sbi = BitfieldInsert(N0, Msb, Lsb, DL, VT); + ReplaceNode(Node, Sbi); + return true; + } + + return false; +} + bool RISCVDAGToDAGISel::tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, @@ -1214,6 +1267,9 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) { if (trySignedBitfieldExtract(Node)) return; + if (trySignedBitfieldInsertInSign(Node)) + return; + // Optimize (sra (sext_inreg X, i16), C) -> // (srai (slli X, (XLen-16), (XLen-16) + C) // And (sra (sext_inreg X, i8), C) -> diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h index abc0372d15c4f..cb63c21fd8fc9 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h @@ -77,6 +77,7 @@ class RISCVDAGToDAGISel : public SelectionDAGISel { bool tryShrinkShlLogicImm(SDNode *Node); bool trySignedBitfieldExtract(SDNode *Node); + bool trySignedBitfieldInsertInSign(SDNode *Node); bool tryUnsignedBitfieldExtract(SDNode *Node, const SDLoc &DL, MVT VT, SDValue X, unsigned Msb, unsigned Lsb); bool tryUnsignedBitfieldInsertInZero(SDNode *Node, const SDLoc &DL, MVT VT, diff --git a/llvm/test/CodeGen/RISCV/rv32xandesperf.ll b/llvm/test/CodeGen/RISCV/rv32xandesperf.ll index 3996420d477b2..3e7f09f3d6c22 100644 --- a/llvm/test/CodeGen/RISCV/rv32xandesperf.ll +++ b/llvm/test/CodeGen/RISCV/rv32xandesperf.ll @@ -154,6 +154,32 @@ define i32 @bfos_from_ashr_sexti16_i32(i16 %x) { ret i32 %ashr } +; MSB = 0 + +define i32 @bfos_from_ashr_shl_with_msb_zero_insert_i32(i32 %x) { +; CHECK-LABEL: bfos_from_ashr_shl_with_msb_zero_insert_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: nds.bfos a0, a0, 0, 14 +; CHECK-NEXT: ret + %shl = shl i32 %x, 31 + %lshr = ashr i32 %shl, 17 + ret i32 %lshr +} + +; MSB < LSB + +define 
i32 @bfos_from_ashr_shl_insert_i32(i32 %x) { +; CHECK-LABEL: bfos_from_ashr_shl_insert_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: nds.bfos a0, a0, 18, 20 +; CHECK-NEXT: ret + %shl = shl i32 %x, 29 + %lshr = ashr i32 %shl, 11 + ret i32 %lshr +} + +; sext + define i32 @sexti1_i32(i32 %a) { ; CHECK-LABEL: sexti1_i32: ; CHECK: # %bb.0: diff --git a/llvm/test/CodeGen/RISCV/rv64xandesperf.ll b/llvm/test/CodeGen/RISCV/rv64xandesperf.ll index af7c300a92d1f..98cda42665169 100644 --- a/llvm/test/CodeGen/RISCV/rv64xandesperf.ll +++ b/llvm/test/CodeGen/RISCV/rv64xandesperf.ll @@ -212,6 +212,52 @@ define i64 @bfos_from_ashr_sexti16_i64(i16 %x) { ret i64 %ashr } +; MSB = 0 + +define i32 @bfos_from_ashr_shl_with_msb_zero_insert_i32(i32 %x) { +; CHECK-LABEL: bfos_from_ashr_shl_with_msb_zero_insert_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: nds.bfos a0, a0, 0, 14 +; CHECK-NEXT: ret + %shl = shl i32 %x, 31 + %lshr = ashr i32 %shl, 17 + ret i32 %lshr +} + +define i64 @bfos_from_ashr_shl_with_msb_zero_insert_i64(i64 %x) { +; CHECK-LABEL: bfos_from_ashr_shl_with_msb_zero_insert_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: nds.bfos a0, a0, 0, 46 +; CHECK-NEXT: ret + %shl = shl i64 %x, 63 + %lshr = ashr i64 %shl, 17 + ret i64 %lshr +} + +; MSB < LSB + +define i32 @bfos_from_ashr_shl_insert_i32(i32 %x) { +; CHECK-LABEL: bfos_from_ashr_shl_insert_i32: +; CHECK: # %bb.0: +; CHECK-NEXT: nds.bfos a0, a0, 18, 20 +; CHECK-NEXT: ret + %shl = shl i32 %x, 29 + %lshr = ashr i32 %shl, 11 + ret i32 %lshr +} + +define i64 @bfos_from_ashr_shl_insert_i64(i64 %x) { +; CHECK-LABEL: bfos_from_ashr_shl_insert_i64: +; CHECK: # %bb.0: +; CHECK-NEXT: nds.bfos a0, a0, 18, 52 +; CHECK-NEXT: ret + %shl = shl i64 %x, 29 + %lshr = ashr i64 %shl, 11 + ret i64 %lshr +} + +; sext + define signext i32 @sexti1_i32(i32 signext %a) { ; CHECK-LABEL: sexti1_i32: ; CHECK: # %bb.0: From c2cb571c6cbcec75ab401974348f9f0d9b2190db Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Tue, 10 Jun 2025 23:41:41 -0700 Subject: [PATCH 030/851] 
[Clang][NFC] Move UntypedParameters instead of copy (#143646) Static analysis flagged that UntypedParameters could be moved instead of copied. This would avoid copying a large object. --- clang/lib/Sema/SemaExprCXX.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 2546ab5c0a342..c106ea749170f 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -2888,7 +2888,7 @@ static bool resolveAllocationOverload( // type-identity-less argument list. IAP.PassTypeIdentity = TypeAwareAllocationMode::No; IAP.PassAlignment = InitialAlignmentMode; - Args = UntypedParameters; + Args = std::move(UntypedParameters); } assert(!S.isStdTypeIdentity(Args[0]->getType(), nullptr)); return resolveAllocationOverloadInterior( From a17e97e6778b2cd4114052faf6ee25db330ef405 Mon Sep 17 00:00:00 2001 From: maflcko <6399679+maflcko@users.noreply.github.com> Date: Wed, 11 Jun 2025 08:43:23 +0200 Subject: [PATCH 031/851] [libc++] Add missing C++20 [time.point.arithmetic] (#143165) This was part of https://wg21.link/p0355r7, but apparently never implemented. 
--------- Co-authored-by: MarcoFalke <*~=`'#}+{/-|&$^_@721217.xyz> Co-authored-by: Hristo Hristov --- libcxx/include/__chrono/time_point.h | 13 +++++++ libcxx/include/chrono | 5 +++ .../time.point.arithmetic/op_++.pass.cpp | 35 +++++++++++++++++++ .../time.point.arithmetic/op_++int.pass.cpp | 35 +++++++++++++++++++ .../time.point.arithmetic/op_--.pass.cpp | 35 +++++++++++++++++++ .../time.point.arithmetic/op_--int.pass.cpp | 35 +++++++++++++++++++ 6 files changed, 158 insertions(+) create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp create mode 100644 libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp diff --git a/libcxx/include/__chrono/time_point.h b/libcxx/include/__chrono/time_point.h index 6b866b882f89a..fc4408d23dbf1 100644 --- a/libcxx/include/__chrono/time_point.h +++ b/libcxx/include/__chrono/time_point.h @@ -58,6 +58,19 @@ class time_point { // arithmetic +#if _LIBCPP_STD_VER >= 20 + _LIBCPP_HIDE_FROM_ABI constexpr time_point& operator++() { + ++__d_; + return *this; + } + _LIBCPP_HIDE_FROM_ABI constexpr time_point operator++(int) { return time_point{__d_++}; } + _LIBCPP_HIDE_FROM_ABI constexpr time_point& operator--() { + --__d_; + return *this; + } + _LIBCPP_HIDE_FROM_ABI constexpr time_point operator--(int) { return time_point{__d_--}; } +#endif // _LIBCPP_STD_VER >= 20 + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX17 time_point& operator+=(const duration& __d) { __d_ += __d; return *this; diff --git a/libcxx/include/chrono b/libcxx/include/chrono index cd9b98872083e..82e99a31bcc9f 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -132,6 +132,11 @@ public: // arithmetic + constexpr time_point& operator++(); // C++20 + constexpr time_point operator++(int); // C++20 + constexpr time_point& 
operator--(); // C++20 + constexpr time_point operator--(int); // C++20 + time_point& operator+=(const duration& d); // constexpr in C++17 time_point& operator-=(const duration& d); // constexpr in C++17 diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp new file mode 100644 index 0000000000000..e035d7ef4fa0e --- /dev/null +++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++.pass.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: std-at-least-c++20 + +// <chrono> + +// time_point + +// constexpr time_point& operator++(); + +#include <cassert> +#include <chrono> + +#include "test_macros.h" + +constexpr bool test() { + using Clock = std::chrono::system_clock; + using Duration = std::chrono::milliseconds; + std::chrono::time_point<Clock, Duration> t{Duration{5}}; + std::chrono::time_point<Clock, Duration>& tref{++t}; + assert(&tref == &t); + assert(tref.time_since_epoch() == Duration{6}); + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + return 0; +} diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp new file mode 100644 index 0000000000000..5304d37d5c361 --- /dev/null +++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_++int.pass.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: std-at-least-c++20 + +// <chrono> + +// time_point + +// constexpr time_point operator++(int); + +#include <cassert> +#include <chrono> + +#include "test_macros.h" + +constexpr bool test() { + using Clock = std::chrono::system_clock; + using Duration = std::chrono::milliseconds; + std::chrono::time_point<Clock, Duration> t1{Duration{3}}; + std::chrono::time_point<Clock, Duration> t2{t1++}; + assert(t1.time_since_epoch() == Duration{4}); + assert(t2.time_since_epoch() == Duration{3}); + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + return 0; +} diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp new file mode 100644 index 0000000000000..915156fcc6b8c --- /dev/null +++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--.pass.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: std-at-least-c++20 + +// <chrono> + +// time_point + +// constexpr time_point& operator--(); + +#include <cassert> +#include <chrono> + +#include "test_macros.h" + +constexpr bool test() { + using Clock = std::chrono::system_clock; + using Duration = std::chrono::milliseconds; + std::chrono::time_point<Clock, Duration> t{Duration{5}}; + std::chrono::time_point<Clock, Duration>& tref{--t}; + assert(&tref == &t); + assert(tref.time_since_epoch() == Duration{4}); + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + return 0; +} diff --git a/libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp new file mode 100644 index 0000000000000..cc5f462106bbf --- /dev/null +++ b/libcxx/test/std/time/time.point/time.point.arithmetic/op_--int.pass.cpp @@ -0,0 +1,35 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// REQUIRES: std-at-least-c++20 + +// <chrono> + +// time_point + +// constexpr time_point operator--(int); + +#include <cassert> +#include <chrono> + +#include "test_macros.h" + +constexpr bool test() { + using Clock = std::chrono::system_clock; + using Duration = std::chrono::milliseconds; + std::chrono::time_point<Clock, Duration> t1{Duration{3}}; + std::chrono::time_point<Clock, Duration> t2{t1--}; + assert(t1.time_since_epoch() == Duration{2}); + assert(t2.time_since_epoch() == Duration{3}); + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + return 0; +} From 0f3c54a3b3289b6375a1d32684e831cb407af003 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 10 Jun 2025 21:33:53 +0100 Subject: [PATCH 032/851] [X86] Add test coverage showing failure to merge "zero input passthrough" behaviour for BSR instructions on x86_64 targets --- llvm/test/CodeGen/X86/bsr.ll | 492 +++++++++++++++++++++++++++++++++++ 1 file changed, 492 insertions(+) create mode 100644 llvm/test/CodeGen/X86/bsr.ll diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll new file mode 100644 index 0000000000000..1247b3ec59324 --- /dev/null +++ b/llvm/test/CodeGen/X86/bsr.ll @@ -0,0 +1,492 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64 + +define i8 @cmov_bsr8(i8 %x, i8 %y) nounwind { +; X86-LABEL: cmov_bsr8: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: testb %cl, %cl +; X86-NEXT: je .LBB0_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: movzbl %cl, %eax +; X86-NEXT: bsrl %eax, %eax +; X86-NEXT: xorl $7, %eax +; X86-NEXT: testb %cl, %cl +; X86-NEXT: je .LBB0_4 +; X86-NEXT: .LBB0_5: # %cond.end +; X86-NEXT: xorb $7, %al +; X86-NEXT: # kill:
def $al killed $al killed $eax +; X86-NEXT: retl +; X86-NEXT: .LBB0_1: +; X86-NEXT: movb $8, %al +; X86-NEXT: testb %cl, %cl +; X86-NEXT: jne .LBB0_5 +; X86-NEXT: .LBB0_4: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsr8: +; X64: # %bb.0: +; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: movl $15, %eax +; X64-NEXT: bsrl %ecx, %eax +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %1 = tail call i8 @llvm.ctlz.i8(i8 %x, i1 false) + %2 = xor i8 %1, 7 + %3 = icmp eq i8 %x, 0 + %4 = select i1 %3, i8 %y, i8 %2 + ret i8 %4 +} + +define i8 @cmov_bsr8_undef(i8 %x, i8 %y) nounwind { +; X86-LABEL: cmov_bsr8_undef: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: jne .LBB1_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; X86-NEXT: .LBB1_1: +; X86-NEXT: bsrl %eax, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsr8_undef: +; X64: # %bb.0: +; X64-NEXT: movzbl %dil, %ecx +; X64-NEXT: bsrl %ecx, %eax +; X64-NEXT: testb %cl, %cl +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %1 = tail call i8 @llvm.ctlz.i8(i8 %x, i1 true) + %2 = xor i8 %1, 7 + %3 = icmp ne i8 %x, 0 + %4 = select i1 %3, i8 %2, i8 %y + ret i8 %4 +} + +define i16 @cmov_bsr16(i16 %x, i16 %y) nounwind { +; X86-LABEL: cmov_bsr16: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testw %ax, %ax +; X86-NEXT: je .LBB2_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: bsrw %ax, %cx +; X86-NEXT: xorl $15, %ecx +; X86-NEXT: testw %ax, %ax +; X86-NEXT: jne .LBB2_4 +; X86-NEXT: .LBB2_5: # %cond.end +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; 
X86-NEXT: retl +; X86-NEXT: .LBB2_1: +; X86-NEXT: movw $16, %cx +; X86-NEXT: testw %ax, %ax +; X86-NEXT: je .LBB2_5 +; X86-NEXT: .LBB2_4: +; X86-NEXT: movzwl %cx, %eax +; X86-NEXT: xorl $15, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsr16: +; X64: # %bb.0: +; X64-NEXT: movw $31, %ax +; X64-NEXT: bsrw %di, %ax +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %1 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 false) + %2 = xor i16 %1, 15 + %3 = icmp ne i16 %x, 0 + %4 = select i1 %3, i16 %2, i16 %y + ret i16 %4 +} + +define i16 @cmov_bsr16_undef(i16 %x, i16 %y) nounwind { +; X86-LABEL: cmov_bsr16_undef: +; X86: # %bb.0: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testw %ax, %ax +; X86-NEXT: je .LBB3_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: bsrw %ax, %ax +; X86-NEXT: retl +; X86-NEXT: .LBB3_1: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsr16_undef: +; X64: # %bb.0: +; X64-NEXT: bsrw %di, %ax +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %1 = tail call i16 @llvm.ctlz.i16(i16 %x, i1 true) + %2 = xor i16 %1, 15 + %3 = icmp eq i16 %x, 0 + %4 = select i1 %3, i16 %y, i16 %2 + ret i16 %4 +} + +define i32 @cmov_bsr32(i32 %x, i32 %y) nounwind { +; X86-LABEL: cmov_bsr32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: je .LBB4_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: bsrl %ecx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: je .LBB4_4 +; X86-NEXT: .LBB4_5: # %cond.end +; X86-NEXT: xorl $31, %eax +; X86-NEXT: retl +; X86-NEXT: .LBB4_1: +; X86-NEXT: movl $32, %eax +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB4_5 +; X86-NEXT: .LBB4_4: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsr32: +; X64: # %bb.0: +; X64-NEXT: movl $63, %eax +; X64-NEXT: 
bsrl %edi, %eax +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: retq + %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false) + %2 = xor i32 %1, 31 + %3 = icmp eq i32 %x, 0 + %4 = select i1 %3, i32 %y, i32 %2 + ret i32 %4 +} + +define i32 @cmov_bsr32_undef(i32 %x, i32 %y) nounwind { +; X86-LABEL: cmov_bsr32_undef: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB5_1: +; X86-NEXT: bsrl %eax, %eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsr32_undef: +; X64: # %bb.0: +; X64-NEXT: bsrl %edi, %eax +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: retq + %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true) + %2 = xor i32 %1, 31 + %3 = icmp ne i32 %x, 0 + %4 = select i1 %3, i32 %2, i32 %y + ret i32 %4 +} + +define i64 @cmov_bsr64(i64 %x, i64 %y) nounwind { +; X86-LABEL: cmov_bsr64: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: je .LBB6_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB6_3 +; X86-NEXT: # %bb.4: # %cond.false +; X86-NEXT: bsrl %esi, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl $32, %eax +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: je .LBB6_7 +; X86-NEXT: jmp .LBB6_6 +; X86-NEXT: .LBB6_1: +; X86-NEXT: movl $64, %eax +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: jne .LBB6_6 +; X86-NEXT: .LBB6_7: # %cond.end +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB6_3: +; X86-NEXT: bsrl %ecx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: je .LBB6_7 +; X86-NEXT: .LBB6_6: +; X86-NEXT: xorl $63, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsr64: +; X64: # %bb.0: +; X64-NEXT: movl 
$127, %eax +; X64-NEXT: bsrq %rdi, %rax +; X64-NEXT: cmoveq %rsi, %rax +; X64-NEXT: retq + %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false) + %2 = xor i64 %1, 63 + %3 = icmp ne i64 %x, 0 + %4 = select i1 %3, i64 %2, i64 %y + ret i64 %4 +} + +define i64 @cmov_bsr64_undef(i64 %x, i64 %y) nounwind { +; X86-LABEL: cmov_bsr64_undef: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB7_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: bsrl %ecx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl $32, %eax +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: jne .LBB7_5 +; X86-NEXT: .LBB7_4: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: retl +; X86-NEXT: .LBB7_1: +; X86-NEXT: bsrl %edx, %eax +; X86-NEXT: xorl $31, %eax +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: je .LBB7_4 +; X86-NEXT: .LBB7_5: +; X86-NEXT: xorl $63, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsr64_undef: +; X64: # %bb.0: +; X64-NEXT: bsrq %rdi, %rax +; X64-NEXT: cmoveq %rsi, %rax +; X64-NEXT: retq + %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true) + %2 = xor i64 %1, 63 + %3 = icmp eq i64 %x, 0 + %4 = select i1 %3, i64 %y, i64 %2 + ret i64 %4 +} + +define i128 @cmov_bsr128(i128 %x, i128 %y) nounwind { +; X86-LABEL: cmov_bsr128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: pushl %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: orl %ebp, %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: orl %ebx, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: je .LBB8_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: testl %ebp, %ebp +; X86-NEXT: jne .LBB8_3 +; X86-NEXT: # %bb.4: # %cond.false +; X86-NEXT: bsrl %ebx, %edx +; X86-NEXT: 
xorl $31, %edx +; X86-NEXT: orl $32, %edx +; X86-NEXT: testl %edi, %edi +; X86-NEXT: je .LBB8_7 +; X86-NEXT: .LBB8_6: +; X86-NEXT: bsrl %edi, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: jmp .LBB8_8 +; X86-NEXT: .LBB8_1: +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NEXT: movl $128, %edx +; X86-NEXT: jmp .LBB8_11 +; X86-NEXT: .LBB8_3: +; X86-NEXT: bsrl %ebp, %edx +; X86-NEXT: xorl $31, %edx +; X86-NEXT: testl %edi, %edi +; X86-NEXT: jne .LBB8_6 +; X86-NEXT: .LBB8_7: # %cond.false +; X86-NEXT: bsrl %ecx, %esi +; X86-NEXT: xorl $31, %esi +; X86-NEXT: orl $32, %esi +; X86-NEXT: .LBB8_8: # %cond.false +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: orl %ebp, %ebx +; X86-NEXT: jne .LBB8_10 +; X86-NEXT: # %bb.9: # %cond.false +; X86-NEXT: orl $64, %esi +; X86-NEXT: movl %esi, %edx +; X86-NEXT: .LBB8_10: # %cond.false +; X86-NEXT: movl $0, (%esp) # 4-byte Folded Spill +; X86-NEXT: .LBB8_11: # %cond.end +; X86-NEXT: xorl %esi, %esi +; X86-NEXT: xorl %ebx, %ebx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: orl %ebp, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: je .LBB8_12 +; X86-NEXT: # %bb.13: # %cond.end +; X86-NEXT: xorl $127, %edx +; X86-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NEXT: jmp .LBB8_14 +; X86-NEXT: .LBB8_12: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: .LBB8_14: # %cond.end +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ebx, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %ecx, 4(%eax) +; X86-NEXT: movl %edx, (%eax) +; X86-NEXT: addl $4, %esp +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: cmov_bsr128: +; X64: # %bb.0: +; X64-NEXT: bsrq %rsi, %r8 +; X64-NEXT: xorq $63, %r8 +; X64-NEXT: movl $127, %eax +; X64-NEXT: bsrq %rdi, %rax +; X64-NEXT: xorq $63, %rax +; X64-NEXT: addq $64, %rax +; 
X64-NEXT: testq %rsi, %rsi +; X64-NEXT: cmovneq %r8, %rax +; X64-NEXT: xorq $127, %rax +; X64-NEXT: xorl %r8d, %r8d +; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmoveq %rcx, %r8 +; X64-NEXT: movq %r8, %rdx +; X64-NEXT: retq + %1 = tail call i128 @llvm.ctlz.i128(i128 %x, i1 false) + %2 = xor i128 %1, 127 + %3 = icmp eq i128 %x, 0 + %4 = select i1 %3, i128 %y, i128 %2 + ret i128 %4 +} + +define i128 @cmov_bsr128_undef(i128 %x, i128 %y) nounwind { +; X86-LABEL: cmov_bsr128_undef: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: testl %edi, %edi +; X86-NEXT: jne .LBB9_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: bsrl %esi, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: orl $32, %ecx +; X86-NEXT: jmp .LBB9_3 +; X86-NEXT: .LBB9_1: +; X86-NEXT: bsrl %edi, %ecx +; X86-NEXT: xorl $31, %ecx +; X86-NEXT: .LBB9_3: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB9_4 +; X86-NEXT: # %bb.5: +; X86-NEXT: bsrl %ebx, %ebp +; X86-NEXT: xorl $31, %ebp +; X86-NEXT: orl $32, %ebp +; X86-NEXT: jmp .LBB9_6 +; X86-NEXT: .LBB9_4: +; X86-NEXT: bsrl %edx, %ebp +; X86-NEXT: xorl $31, %ebp +; X86-NEXT: .LBB9_6: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl %edi, %esi +; X86-NEXT: jne .LBB9_8 +; X86-NEXT: # %bb.7: +; X86-NEXT: orl $64, %ebp +; X86-NEXT: movl %ebp, %ecx +; X86-NEXT: .LBB9_8: +; X86-NEXT: orl %edi, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: orl %edx, %ebx +; X86-NEXT: jne .LBB9_9 +; X86-NEXT: # %bb.10: +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: jmp .LBB9_11 +; X86-NEXT: .LBB9_9: +; X86-NEXT: xorl $127, %ecx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: xorl %esi, %esi +; 
X86-NEXT: xorl %edi, %edi +; X86-NEXT: .LBB9_11: +; X86-NEXT: movl %edi, 12(%eax) +; X86-NEXT: movl %esi, 8(%eax) +; X86-NEXT: movl %edx, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: cmov_bsr128_undef: +; X64: # %bb.0: +; X64-NEXT: bsrq %rsi, %r8 +; X64-NEXT: xorq $63, %r8 +; X64-NEXT: bsrq %rdi, %rax +; X64-NEXT: xorq $63, %rax +; X64-NEXT: orq $64, %rax +; X64-NEXT: testq %rsi, %rsi +; X64-NEXT: cmovneq %r8, %rax +; X64-NEXT: xorq $127, %rax +; X64-NEXT: xorl %r8d, %r8d +; X64-NEXT: orq %rsi, %rdi +; X64-NEXT: cmoveq %rdx, %rax +; X64-NEXT: cmoveq %rcx, %r8 +; X64-NEXT: movq %r8, %rdx +; X64-NEXT: retq + %1 = tail call i128 @llvm.ctlz.i128(i128 %x, i1 true) + %2 = xor i128 %1, 127 + %3 = icmp ne i128 %x, 0 + %4 = select i1 %3, i128 %2, i128 %y + ret i128 %4 +} + +declare i8 @llvm.ctlz.i8(i8, i1) +declare i16 @llvm.ctlz.i16(i16, i1) +declare i32 @llvm.ctlz.i32(i32, i1) +declare i64 @llvm.ctlz.i64(i64, i1) +declare i128 @llvm.ctlz.i128(i128, i1) From a6ace2801e8900a6fe8c3b8295938f3b3c1e4466 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 11 Jun 2025 08:07:30 +0100 Subject: [PATCH 033/851] [X86] combineConcatVectorOps - ensure we're only concatenating v2f64 generic shuffles into vXf64 vshufpd Identified while triaging #143606 - we can't concat v4f64 lhs/rhs subvecs and then expect the v2f64 operands to be in the correct place for VSHUFPD Test coverage will follow --- llvm/lib/Target/X86/X86ISelLowering.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 96be91256915d..8bcd8670879a9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -59383,7 +59383,8 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT, // We can always convert per-lane vXf64 shuffles into VSHUFPD. 
if (!IsSplat && - (VT == MVT::v4f64 || (VT == MVT::v8f64 && Subtarget.useAVX512Regs())) && + ((NumOps == 2 && VT == MVT::v4f64) || + (NumOps == 4 && VT == MVT::v8f64 && Subtarget.useAVX512Regs())) && all_of(Ops, [](SDValue Op) { return Op.hasOneUse(); })) { // Collect the individual per-lane v2f64/v4f64 shuffles. MVT OpVT = Ops[0].getSimpleValueType(); From 32ac7dc2d21843091116b636777c174830cd2dd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= Date: Wed, 11 Jun 2025 09:24:03 +0200 Subject: [PATCH 034/851] [test][AArch64] Adjust vector insertion lit tests (#143101) The test cases test_insert_v16i8_insert_2_undef_base and test_insert_v16i8_insert_2_undef_base_different_valeus in CodeGen/AArch64/arm64-vector-insertion.ll were leaving element 8 in the vector as "undef" without any real explanation. It kind of looked like a typo, as the input IR looked like this: %v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8 %v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10 leaving %v.8 as unused. This patch is cleaning up the tests a bit by adding separate test cases to validate what is happening when skipping insert at index 8, while amending the original test cases to use %v.8 instead of %v.7 when creating %v.10.
--- .../CodeGen/AArch64/arm64-vector-insertion.ll | 69 ++++++++++++++++++- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll index 94074d1689f6a..ff28c7817d143 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vector-insertion.ll @@ -66,6 +66,35 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) { %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6 %v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7 %v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8 + %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10 + %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11 + %v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12 + %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13 + %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14 + %v.15 = insertelement <16 x i8> %v.14, i8 %a, i32 15 + ret <16 x i8> %v.15 +} + +; Similar to above, but we leave element 8 as undef. One interesting part with +; this test case is that %a may be poison, so simply inserting %a also at +; index 8 would make the result vector more poisonous. 
+define <16 x i8> @test_insert_v16i8_insert_2_undef_base_skip8(i32 %a0) { +; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_skip8: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr w8, w0, #5 +; CHECK-NEXT: dup.16b v0, w8 +; CHECK-NEXT: mov.b v0[5], wzr +; CHECK-NEXT: mov.b v0[9], wzr +; CHECK-NEXT: ret + %a1 = lshr exact i32 %a0, 5 + %a = trunc i32 %a1 to i8 + %v.0 = insertelement <16 x i8> , i8 %a, i32 0 + %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1 + %v.2 = insertelement <16 x i8> %v.1, i8 %a, i32 2 + %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3 + %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4 + %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6 + %v.7 = insertelement <16 x i8> %v.6, i8 %a, i32 7 %v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10 %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11 %v.12 = insertelement <16 x i8> %v.11, i8 %a, i32 12 @@ -75,8 +104,8 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base(i8 %a) { ret <16 x i8> %v.15 } -define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a, i8 %b) { -; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_valeus: +define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_values(i8 %a, i8 %b) { +; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_values: ; CHECK: // %bb.0: ; CHECK-NEXT: dup.16b v0, w0 ; CHECK-NEXT: mov.b v0[2], w1 @@ -94,6 +123,42 @@ define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_valeus(i8 %a, %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6 %v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7 %v.8 = insertelement <16 x i8> %v.7, i8 %a, i32 8 + %v.10 = insertelement <16 x i8> %v.8, i8 %a, i32 10 + %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11 + %v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12 + %v.13 = insertelement <16 x i8> %v.12, i8 %a, i32 13 + %v.14 = insertelement <16 x i8> %v.13, i8 %a, i32 14 + %v.15 = insertelement <16 x i8> %v.14, i8 %b, i32 15 + ret <16 x i8> %v.15 +} + +; Similar 
to above, but we leave element 8 as undef. One interesting part with +; this test case is that %a and %b may be poison, so simply inserting %a or %b +; at index 8 would make the result vector more poisonous. +define <16 x i8> @test_insert_v16i8_insert_2_undef_base_different_values_skip8(i32 %a0, i32 %b0) { +; CHECK-LABEL: test_insert_v16i8_insert_2_undef_base_different_values_skip8: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr w8, w0, #5 +; CHECK-NEXT: dup.16b v0, w8 +; CHECK-NEXT: lsr w8, w1, #5 +; CHECK-NEXT: mov.b v0[2], w8 +; CHECK-NEXT: mov.b v0[5], wzr +; CHECK-NEXT: mov.b v0[7], w8 +; CHECK-NEXT: mov.b v0[9], wzr +; CHECK-NEXT: mov.b v0[12], w8 +; CHECK-NEXT: mov.b v0[15], w8 +; CHECK-NEXT: ret + %a1 = lshr exact i32 %a0, 5 + %a = trunc i32 %a1 to i8 + %b1 = lshr exact i32 %b0, 5 + %b = trunc i32 %b1 to i8 + %v.0 = insertelement <16 x i8> , i8 %a, i32 0 + %v.1 = insertelement <16 x i8> %v.0, i8 %a, i32 1 + %v.2 = insertelement <16 x i8> %v.1, i8 %b, i32 2 + %v.3 = insertelement <16 x i8> %v.2, i8 %a, i32 3 + %v.4 = insertelement <16 x i8> %v.3, i8 %a, i32 4 + %v.6 = insertelement <16 x i8> %v.4, i8 %a, i32 6 + %v.7 = insertelement <16 x i8> %v.6, i8 %b, i32 7 %v.10 = insertelement <16 x i8> %v.7, i8 %a, i32 10 %v.11 = insertelement <16 x i8> %v.10, i8 %a, i32 11 %v.12 = insertelement <16 x i8> %v.11, i8 %b, i32 12 From 686ec6cfe86367c43dccd83d7e6e2bac7e6a73a0 Mon Sep 17 00:00:00 2001 From: Paschalis Mpeis Date: Wed, 11 Jun 2025 08:24:10 +0100 Subject: [PATCH 035/851] [BOLT][AArch64] Fix adr-relaxation.s test (#143151) On some AArch64 machines the splitting was inconsistent. This causes cold `foo` to have a `mov` instruction before adrp. ``` : mov x0, #0x0 // =0 adrp x1, 0x600000 <_start> add x1, x1, #0x14 ret ``` This patch removes the `mov` instruction right above .L2, making splitting deterministic. 
--- bolt/test/AArch64/adr-relaxation.s | 1 - 1 file changed, 1 deletion(-) diff --git a/bolt/test/AArch64/adr-relaxation.s b/bolt/test/AArch64/adr-relaxation.s index a643a62339ba3..864650c3287d8 100644 --- a/bolt/test/AArch64/adr-relaxation.s +++ b/bolt/test/AArch64/adr-relaxation.s @@ -34,7 +34,6 @@ foo: .cfi_startproc cmp x1, x11 b.hi .L2 - mov x0, #0x0 .L2: # CHECK-FOO: : # CHECK-FOO-NEXT: adrp From 521e6ce5c8fdfc72cccc1accd78a59f1a5e2805a Mon Sep 17 00:00:00 2001 From: Baranov Victor Date: Wed, 11 Jun 2025 10:25:29 +0300 Subject: [PATCH 036/851] [CI] Add mention of LLVM Developer Policy in email-check message (NFC) (#143300) As of now, it may be hard for people to get the truth from the long Discourse discussion, so a link to the official document may be enough to convince them to change their email from private to public. --- .github/workflows/email-check.yaml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/email-check.yaml b/.github/workflows/email-check.yaml index f4481d5cf5583..904ad718f97dd 100644 --- a/.github/workflows/email-check.yaml +++ b/.github/workflows/email-check.yaml @@ -32,7 +32,8 @@ jobs: COMMENT: >- ⚠️ We detected that you are using a GitHub private e-mail address to contribute to the repo.
Please turn off [Keep my email addresses private](https://github.com/settings/emails) setting in your account.
- See [LLVM Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it) for more information. + See [LLVM Developer Policy](https://llvm.org/docs/DeveloperPolicy.html#email-addresses) and + [LLVM Discourse](https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it) for more information. run: | cat << EOF > comments [{"body" : "$COMMENT"}] From 17f1dac805d388596be5e8c316c0f14b3222da4e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 11 Jun 2025 08:12:42 +0100 Subject: [PATCH 037/851] [X86] Add test coverage showing failure to merge "zero input passthrough" behaviour for BSF instructions on x86_64 targets --- llvm/test/CodeGen/X86/bsf.ll | 452 +++++++++++++++++++++++++++++++++++ 1 file changed, 452 insertions(+) create mode 100644 llvm/test/CodeGen/X86/bsf.ll diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll new file mode 100644 index 0000000000000..58929115baf54 --- /dev/null +++ b/llvm/test/CodeGen/X86/bsf.ll @@ -0,0 +1,452 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64 + +define i8 @cmov_bsf8(i8 %x, i8 %y) nounwind { +; X86-LABEL: cmov_bsf8: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb %al, %al +; X86-NEXT: je .LBB0_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: orl $256, %eax # imm = 0x100 +; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; X86-NEXT: .LBB0_1: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsf8: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: orl $256, %eax # imm = 0x100 +; X64-NEXT: rep bsfl %eax, %eax +; X64-NEXT: testb %dil, %dil +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: 
def $al killed $al killed $eax +; X64-NEXT: retq + %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 false) + %2 = icmp eq i8 %x, 0 + %3 = select i1 %2, i8 %y, i8 %1 + ret i8 %3 +} + +define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind { +; X86-LABEL: cmov_bsf8_undef: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testb %al, %al +; X86-NEXT: je .LBB1_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; X86-NEXT: .LBB1_1: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $al killed $al killed $eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsf8_undef: +; X64: # %bb.0: +; X64-NEXT: rep bsfl %edi, %eax +; X64-NEXT: testb %dil, %dil +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: def $al killed $al killed $eax +; X64-NEXT: retq + %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 true) + %2 = icmp eq i8 %x, 0 + %3 = select i1 %2, i8 %y, i8 %1 + ret i8 %3 +} + +define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind { +; X86-LABEL: cmov_bsf16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testw %ax, %ax +; X86-NEXT: je .LBB2_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: orl $65536, %eax # imm = 0x10000 +; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; X86-NEXT: .LBB2_1: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsf16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: orl $65536, %eax # imm = 0x10000 +; X64-NEXT: rep bsfl %eax, %eax +; X64-NEXT: testw %di, %di +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 false) + %2 = icmp eq i16 %x, 0 + %3 = select i1 %2, i16 %y, i16 %1 + ret i16 %3 +} + +define i16 @cmov_bsf16_undef(i16 %x, i16 %y) nounwind { +; X86-LABEL: cmov_bsf16_undef: +; X86: # %bb.0: +; X86-NEXT: 
movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testw %ax, %ax +; X86-NEXT: je .LBB3_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; X86-NEXT: .LBB3_1: +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsf16_undef: +; X64: # %bb.0: +; X64-NEXT: rep bsfl %edi, %eax +; X64-NEXT: testw %di, %di +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 true) + %2 = icmp eq i16 %x, 0 + %3 = select i1 %2, i16 %y, i16 %1 + ret i16 %3 +} + +define i32 @cmov_bsf32(i32 %x, i32 %y) nounwind { +; X86-LABEL: cmov_bsf32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: je .LBB4_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: rep bsfl %ecx, %eax +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB4_5 +; X86-NEXT: .LBB4_4: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: .LBB4_5: # %cond.end +; X86-NEXT: retl +; X86-NEXT: .LBB4_1: +; X86-NEXT: movl $32, %eax +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: je .LBB4_4 +; X86-NEXT: jmp .LBB4_5 +; +; X64-LABEL: cmov_bsf32: +; X64: # %bb.0: +; X64-NEXT: movl $32, %eax +; X64-NEXT: bsfl %edi, %eax +; X64-NEXT: cmovel %esi, %eax +; X64-NEXT: retq + %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 false) + %2 = icmp eq i32 %x, 0 + %3 = select i1 %2, i32 %y, i32 %1 + ret i32 %3 +} + +define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind { +; X86-LABEL: cmov_bsf32_undef: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: testl %eax, %eax +; X86-NEXT: je .LBB5_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: retl +; X86-NEXT: .LBB5_1: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsf32_undef: +; X64: # %bb.0: +; X64-NEXT: bsfl %edi, %eax +; X64-NEXT: cmovel %esi, %eax +; 
X64-NEXT: retq + %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true) + %2 = icmp eq i32 %x, 0 + %3 = select i1 %2, i32 %y, i32 %1 + ret i32 %3 +} + +define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind { +; X86-LABEL: cmov_bsf64: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: je .LBB6_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: testl %esi, %esi +; X86-NEXT: jne .LBB6_3 +; X86-NEXT: # %bb.4: # %cond.false +; X86-NEXT: rep bsfl %ecx, %eax +; X86-NEXT: addl $32, %eax +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: je .LBB6_6 +; X86-NEXT: jmp .LBB6_7 +; X86-NEXT: .LBB6_1: +; X86-NEXT: movl $64, %eax +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: jne .LBB6_7 +; X86-NEXT: .LBB6_6: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: .LBB6_7: # %cond.end +; X86-NEXT: popl %esi +; X86-NEXT: retl +; X86-NEXT: .LBB6_3: +; X86-NEXT: rep bsfl %esi, %eax +; X86-NEXT: orl %ecx, %esi +; X86-NEXT: je .LBB6_6 +; X86-NEXT: jmp .LBB6_7 +; +; X64-LABEL: cmov_bsf64: +; X64: # %bb.0: +; X64-NEXT: movl $64, %eax +; X64-NEXT: bsfq %rdi, %rax +; X64-NEXT: cmoveq %rsi, %rax +; X64-NEXT: retq + %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 false) + %2 = icmp eq i64 %x, 0 + %3 = select i1 %2, i64 %y, i64 %1 + ret i64 %3 +} + +define i64 @cmov_bsf64_undef(i64 %x, i64 %y) nounwind { +; X86-LABEL: cmov_bsf64_undef: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: orl %eax, %edx +; X86-NEXT: je .LBB7_5 +; X86-NEXT: # %bb.1: # %select.false.sink +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB7_2 +; X86-NEXT: # %bb.3: # %select.false.sink +; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: addl $32, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; X86-NEXT: .LBB7_5: # %select.end +; X86-NEXT: movl 
{{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: retl +; X86-NEXT: .LBB7_2: +; X86-NEXT: rep bsfl %ecx, %eax +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: retl +; +; X64-LABEL: cmov_bsf64_undef: +; X64: # %bb.0: +; X64-NEXT: bsfq %rdi, %rax +; X64-NEXT: cmoveq %rsi, %rax +; X64-NEXT: retq + %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 true) + %2 = icmp eq i64 %x, 0 + %3 = select i1 %2, i64 %y, i64 %1 + ret i64 %3 +} + +define i128 @cmov_bsf128(i128 %x, i128 %y) nounwind { +; X86-LABEL: cmov_bsf128: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %edx +; X86-NEXT: orl %ebp, %edx +; X86-NEXT: movl %ecx, %esi +; X86-NEXT: orl %eax, %esi +; X86-NEXT: orl %edx, %esi +; X86-NEXT: je .LBB8_1 +; X86-NEXT: # %bb.2: # %cond.false +; X86-NEXT: testl %ecx, %ecx +; X86-NEXT: jne .LBB8_3 +; X86-NEXT: # %bb.4: # %cond.false +; X86-NEXT: rep bsfl %edi, %esi +; X86-NEXT: addl $32, %esi +; X86-NEXT: testl %eax, %eax +; X86-NEXT: je .LBB8_7 +; X86-NEXT: .LBB8_6: +; X86-NEXT: rep bsfl %eax, %edx +; X86-NEXT: jmp .LBB8_8 +; X86-NEXT: .LBB8_1: +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: xorl %ebp, %ebp +; X86-NEXT: movl $128, %esi +; X86-NEXT: jmp .LBB8_11 +; X86-NEXT: .LBB8_3: +; X86-NEXT: rep bsfl %ecx, %esi +; X86-NEXT: testl %eax, %eax +; X86-NEXT: jne .LBB8_6 +; X86-NEXT: .LBB8_7: # %cond.false +; X86-NEXT: rep bsfl %ebp, %edx +; X86-NEXT: addl $32, %edx +; X86-NEXT: .LBB8_8: # %cond.false +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: movl %ecx, %ebx +; X86-NEXT: orl %edi, %ebx +; X86-NEXT: jne .LBB8_10 +; X86-NEXT: # %bb.9: # %cond.false +; X86-NEXT: addl $64, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: .LBB8_10: # %cond.false +; X86-NEXT: xorl %ebp, %ebp +; X86-NEXT: .LBB8_11: # %cond.end +; X86-NEXT: xorl 
%ebx, %ebx +; X86-NEXT: xorl %edx, %edx +; X86-NEXT: orl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: orl %eax, %edi +; X86-NEXT: orl %ecx, %edi +; X86-NEXT: jne .LBB8_13 +; X86-NEXT: # %bb.12: +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: .LBB8_13: # %cond.end +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %edx, 12(%eax) +; X86-NEXT: movl %ebx, 8(%eax) +; X86-NEXT: movl %ebp, 4(%eax) +; X86-NEXT: movl %esi, (%eax) +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: cmov_bsf128: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: orq %rsi, %rax +; X64-NEXT: je .LBB8_2 +; X64-NEXT: # %bb.1: # %select.false.sink +; X64-NEXT: rep bsfq %rdi, %rcx +; X64-NEXT: movl $64, %eax +; X64-NEXT: rep bsfq %rsi, %rax +; X64-NEXT: addq $64, %rax +; X64-NEXT: testq %rdi, %rdi +; X64-NEXT: cmovneq %rcx, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: retq +; X64-NEXT: .LBB8_2: # %select.end +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq + %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 false) + %2 = icmp eq i128 %x, 0 + %3 = select i1 %2, i128 %y, i128 %1 + ret i128 %3 +} + +define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { +; X86-LABEL: cmov_bsf128_undef: +; X86: # %bb.0: +; X86-NEXT: pushl %ebp +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %ecx, %edi +; X86-NEXT: orl %esi, %edi +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: orl %ebx, %ebp +; X86-NEXT: orl %edi, %ebp +; X86-NEXT: je .LBB9_11 +; X86-NEXT: # %bb.1: # %select.false.sink +; X86-NEXT: testl %edx, %edx +; X86-NEXT: jne .LBB9_2 +; 
X86-NEXT: # %bb.3: # %select.false.sink +; X86-NEXT: rep bsfl %ecx, %edi +; X86-NEXT: addl $32, %edi +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: je .LBB9_6 +; X86-NEXT: .LBB9_5: +; X86-NEXT: rep bsfl %ebx, %esi +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: je .LBB9_8 +; X86-NEXT: jmp .LBB9_9 +; X86-NEXT: .LBB9_11: # %select.end +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl %esi, 4(%eax) +; X86-NEXT: movl %edx, 8(%eax) +; X86-NEXT: movl %ecx, 12(%eax) +; X86-NEXT: jmp .LBB9_10 +; X86-NEXT: .LBB9_2: +; X86-NEXT: rep bsfl %edx, %edi +; X86-NEXT: testl %ebx, %ebx +; X86-NEXT: jne .LBB9_5 +; X86-NEXT: .LBB9_6: # %select.false.sink +; X86-NEXT: rep bsfl %esi, %esi +; X86-NEXT: addl $32, %esi +; X86-NEXT: orl %ecx, %edx +; X86-NEXT: jne .LBB9_9 +; X86-NEXT: .LBB9_8: # %select.false.sink +; X86-NEXT: addl $64, %esi +; X86-NEXT: movl %esi, %edi +; X86-NEXT: .LBB9_9: # %select.false.sink +; X86-NEXT: movl %edi, (%eax) +; X86-NEXT: movl $0, 12(%eax) +; X86-NEXT: movl $0, 8(%eax) +; X86-NEXT: movl $0, 4(%eax) +; X86-NEXT: .LBB9_10: # %select.false.sink +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: popl %ebp +; X86-NEXT: retl $4 +; +; X64-LABEL: cmov_bsf128_undef: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: orq %rsi, %rax +; X64-NEXT: je .LBB9_2 +; X64-NEXT: # %bb.1: # %select.false.sink +; X64-NEXT: rep bsfq %rdi, %rcx +; X64-NEXT: rep bsfq %rsi, %rax +; X64-NEXT: addq $64, %rax +; X64-NEXT: testq %rdi, %rdi +; X64-NEXT: cmovneq %rcx, %rax +; X64-NEXT: xorl %edx, %edx +; X64-NEXT: retq +; X64-NEXT: .LBB9_2: # %select.end +; X64-NEXT: movq %rdx, %rax +; X64-NEXT: movq %rcx, %rdx +; X64-NEXT: retq + %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 true) + %2 = icmp eq i128 %x, 0 + %3 = select i1 %2, i128 %y, i128 %1 + ret i128 %3 +} + +declare i8 @llvm.cttz.i8(i8, i1) 
+declare i16 @llvm.cttz.i16(i16, i1) +declare i32 @llvm.cttz.i32(i32, i1) +declare i64 @llvm.cttz.i64(i64, i1) +declare i128 @llvm.cttz.i128(i128, i1) From a72bcda1434c72f9db6687565a361479e0dde572 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 11 Jun 2025 08:24:10 +0100 Subject: [PATCH 038/851] [X86] add test coverage for #143606 --- .../X86/vector-shuffle-combining-avx512vl.ll | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll index 15c82f169c86e..d5aa7588925d8 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vl.ll @@ -137,3 +137,31 @@ define void @PR142995(ptr %p0, ptr %p1, ptr %p2) nounwind #0 { } declare <5 x i32> @llvm.masked.load.v5i32.p0(ptr captures(none), i32 immarg, <5 x i1>, <5 x i32>) declare <64 x i32> @llvm.masked.load.v64i32.p0(ptr captures(none), i32 immarg, <64 x i1>, <64 x i32>) + +define <8 x double> @PR143606(ptr %px, ptr %py) { +; X86-LABEL: PR143606: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: vmovapd (%ecx), %ymm0 +; X86-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],mem[1,2],ymm0[3] +; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],mem[0],ymm0[2],mem[3] +; X86-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; X86-NEXT: retl +; +; X64-LABEL: PR143606: +; X64: # %bb.0: +; X64-NEXT: vmovapd (%rdi), %ymm0 +; X64-NEXT: vblendpd {{.*#+}} ymm1 = ymm0[0],mem[1,2],ymm0[3] +; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[1],mem[0],ymm0[2],mem[3] +; X64-NEXT: vinsertf64x4 $1, %ymm0, %zmm1, %zmm0 +; X64-NEXT: retq + %x = load <4 x double>, ptr %px, align 32 + %y.lo = load <4 x double>, ptr %py, align 32 + %py.hi = getelementptr inbounds nuw i8, ptr %py, i64 32 + %y.hi = load <4 x double>, ptr %py.hi, align 32 + %lo = shufflevector <4 x double> %x, <4 x double> %y.lo, <4 x i32> + %hi 
= call <4 x double> @llvm.x86.avx512.vpermi2var.pd.256(<4 x double> %x, <4 x i64> , <4 x double> %y.hi) + %res = shufflevector <4 x double> %lo, <4 x double> %hi, <8 x i32> + ret <8 x double> %res +} From e9bd1aee6537508970614fd79a4f076ba4ed93d0 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 11 Jun 2025 08:30:09 +0100 Subject: [PATCH 039/851] [X86] bmi-select-distrib.ll - remove unused check prefixes and pull out PR comments above tests. NFC --- llvm/test/CodeGen/X86/bmi-select-distrib.ll | 31 +++++++++------------ 1 file changed, 13 insertions(+), 18 deletions(-) diff --git a/llvm/test/CodeGen/X86/bmi-select-distrib.ll b/llvm/test/CodeGen/X86/bmi-select-distrib.ll index 49beda516d508..e5696ded4fbf1 100644 --- a/llvm/test/CodeGen/X86/bmi-select-distrib.ll +++ b/llvm/test/CodeGen/X86/bmi-select-distrib.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi | FileCheck %s --check-prefixes=X86,X86-BMI -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86,X86-BMI2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64,X64-BMI2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+cmov,+sse2,+bmi,+bmi2 | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi,+bmi2 | FileCheck %s --check-prefixes=X64 -define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind { ; PR131587 +define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind { ; X86-LABEL: 
and_select_neg_to_blsi1: ; X86: # %bb.0: ; X86-NEXT: blsil %eax, %ecx @@ -25,8 +25,8 @@ define i32 @and_select_neg_to_blsi1(i1 %a0, i32 inreg %a1) nounwind { ret i32 %ret } -define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind { ; PR131587 +define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind { ; X86-LABEL: and_select_neg_to_blsi2: ; X86: # %bb.0: ; X86-NEXT: blsil %eax, %ecx @@ -46,8 +46,8 @@ define i32 @and_select_neg_to_blsi2(i1 %a0, i32 inreg %a1) nounwind { ret i32 %ret } -define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind { ; PR131587 +define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind { ; X86-LABEL: and_select_neg_to_blsi3: ; X86: # %bb.0: ; X86-NEXT: blsil %eax, %ecx @@ -67,8 +67,8 @@ define i32 @and_select_neg_to_blsi3(i1 %a0, i32 inreg %a1) nounwind { ret i32 %ret } -define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind { ; PR131587 +define i64 @and_select_neg_to_blsi_i64(i1 %a0, i64 %a1) nounwind { ; X86-LABEL: and_select_neg_to_blsi_i64: ; X86: # %bb.0: ; X86-NEXT: pushl %esi @@ -283,8 +283,8 @@ define i32 @and_select_neg_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) no ret i32 %ret } -define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind { ; PR133848 +define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind { ; X86-LABEL: and_select_sub_1_to_blsr1: ; X86: # %bb.0: ; X86-NEXT: blsrl %eax, %ecx @@ -304,8 +304,8 @@ define i32 @and_select_sub_1_to_blsr1(i1 %a0, i32 inreg %a1) nounwind { ret i32 %ret } -define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind { ; PR133848 +define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind { ; X86-LABEL: and_select_sub_1_to_blsr2: ; X86: # %bb.0: ; X86-NEXT: blsrl %eax, %ecx @@ -325,8 +325,8 @@ define i32 @and_select_sub_1_to_blsr2(i1 %a0, i32 inreg %a1) nounwind { ret i32 %ret } -define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind { ; PR133848 +define i32 
@and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind { ; X86-LABEL: and_select_sub_1_to_blsr3: ; X86: # %bb.0: ; X86-NEXT: blsrl %eax, %ecx @@ -346,8 +346,8 @@ define i32 @and_select_sub_1_to_blsr3(i1 %a0, i32 inreg %a1) nounwind { ret i32 %ret } -define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind { ; PR133848 +define i32 @and_select_sub_1_to_blsr4(i1 %a0, i32 inreg %a1) nounwind { ; X86-LABEL: and_select_sub_1_to_blsr4: ; X86: # %bb.0: ; X86-NEXT: blsrl %eax, %ecx @@ -392,8 +392,8 @@ define i32 @and_sub_1_select_orig(i1 %a0, i32 inreg %a1) nounwind { ret i32 %ret } -define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind { ; PR133848 +define i64 @and_select_sub_1_to_blsr_i64(i1 %a0, i64 %a1) nounwind { ; X86-LABEL: and_select_sub_1_to_blsr_i64: ; X86: # %bb.0: ; X86-NEXT: pushl %esi @@ -863,8 +863,3 @@ define i32 @xor_select_sub_1_different_op(i1 %a0, i32 inreg %a1, i32 inreg %a2) %ret = xor i32 %a1, %bls ret i32 %ret } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; X64-BMI: {{.*}} -; X64-BMI2: {{.*}} -; X86-BMI: {{.*}} -; X86-BMI2: {{.*}} From 13115276d0d12b0d9bf952abdc19f04866db16a8 Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 11 Jun 2025 08:32:55 +0100 Subject: [PATCH 040/851] Revert "[AArch64][GlobalISel] Expand 64bit extracts to 128bit to allow more patterns (#142904)" This reverts commit 61cdba602abe67761ab2bbf12bf85710dfa963f4 due to verifier issues. 
--- .../AArch64/GISel/AArch64RegisterBankInfo.cpp | 32 +-- .../GlobalISel/regbank-extract-vector-elt.mir | 4 +- llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll | 3 - llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll | 3 - llvm/test/CodeGen/AArch64/abs.ll | 1 - llvm/test/CodeGen/AArch64/arm64-neon-copy.ll | 13 +- .../AArch64/arm64-neon-simd-ldst-one.ll | 45 ++-- llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll | 55 +++-- llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll | 1 - llvm/test/CodeGen/AArch64/bswap.ll | 1 - llvm/test/CodeGen/AArch64/concat-vector.ll | 7 +- llvm/test/CodeGen/AArch64/double_reduct.ll | 18 +- llvm/test/CodeGen/AArch64/f16-instructions.ll | 12 +- llvm/test/CodeGen/AArch64/faddsub.ll | 4 +- llvm/test/CodeGen/AArch64/fcopysign.ll | 4 +- llvm/test/CodeGen/AArch64/fcvt.ll | 14 +- llvm/test/CodeGen/AArch64/fdiv.ll | 2 +- llvm/test/CodeGen/AArch64/fminimummaximum.ll | 4 +- llvm/test/CodeGen/AArch64/fminmax.ll | 4 +- llvm/test/CodeGen/AArch64/fmla.ll | 6 +- llvm/test/CodeGen/AArch64/fmul.ll | 2 +- .../test/CodeGen/AArch64/fptosi-sat-vector.ll | 1 - .../test/CodeGen/AArch64/fptoui-sat-vector.ll | 1 - llvm/test/CodeGen/AArch64/fptrunc.ll | 4 +- llvm/test/CodeGen/AArch64/fsqrt.ll | 2 +- llvm/test/CodeGen/AArch64/insertextract.ll | 45 ++-- llvm/test/CodeGen/AArch64/itofp.ll | 20 +- llvm/test/CodeGen/AArch64/llvm.exp10.ll | 33 ++- llvm/test/CodeGen/AArch64/popcount.ll | 8 +- llvm/test/CodeGen/AArch64/ptradd.ll | 1 - llvm/test/CodeGen/AArch64/shift.ll | 6 - llvm/test/CodeGen/AArch64/store.ll | 15 +- .../AArch64/vec-combine-compare-to-bitmask.ll | 228 +++++++++++++----- .../CodeGen/AArch64/vecreduce-fadd-strict.ll | 7 +- .../vecreduce-fmax-legalization-nan.ll | 26 +- .../AArch64/vecreduce-fmax-legalization.ll | 26 +- .../CodeGen/AArch64/vecreduce-fmaximum.ll | 26 +- .../AArch64/vecreduce-fmin-legalization.ll | 26 +- .../CodeGen/AArch64/vecreduce-fminimum.ll | 26 +- .../CodeGen/AArch64/vecreduce-fmul-strict.ll | 29 ++- llvm/test/CodeGen/AArch64/vecreduce-fmul.ll | 121 
++++++---- .../AArch64/vecreduce-umax-legalization.ll | 15 +- llvm/test/CodeGen/AArch64/vector-lrint.ll | 25 +- 43 files changed, 592 insertions(+), 334 deletions(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp index 53c7a00a7f9f0..31954e7954c03 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp @@ -399,26 +399,6 @@ void AArch64RegisterBankInfo::applyMappingImpl( MI.getOperand(1).setReg(ConstReg); return applyDefaultMapping(OpdMapper); } - case TargetOpcode::G_EXTRACT_VECTOR_ELT: { - // SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the - // number of duplicate lane-extract patterns needed. Do the same here so - // that selection will operate on the larger vectors. - Register Src = MI.getOperand(1).getReg(); - LLT SrcTy = MRI.getType(Src); - assert(SrcTy.getSizeInBits() == 64 && "Expected 64-bit source vector"); - LLT DstTy = SrcTy.multiplyElements(2); - Builder.setInsertPt(*MI.getParent(), MI.getIterator()); - auto Undef = Builder.buildUndef(SrcTy); - auto Concat = Builder.buildConcatVectors(DstTy, {Src, Undef.getReg(0)}); - MRI.setRegBank(Undef.getReg(0), getRegBank(AArch64::FPRRegBankID)); - MRI.setRegBank(Concat.getReg(0), getRegBank(AArch64::FPRRegBankID)); - for (MachineInstr &Ext : - make_early_inc_range(MRI.use_nodbg_instructions(Src))) { - if (Ext.getOpcode() == TargetOpcode::G_EXTRACT_VECTOR_ELT) - Ext.getOperand(1).setReg(Concat.getReg(0)); - } - return applyDefaultMapping(OpdMapper); - } default: llvm_unreachable("Don't know how to handle that operation"); } @@ -1034,20 +1014,14 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { } break; } - case TargetOpcode::G_EXTRACT_VECTOR_ELT: { + case TargetOpcode::G_EXTRACT_VECTOR_ELT: // Destination and source need to be FPRs. 
OpRegBankIdx[0] = PMI_FirstFPR; OpRegBankIdx[1] = PMI_FirstFPR; - // Index needs to be a GPR constant. + + // Index needs to be a GPR. OpRegBankIdx[2] = PMI_FirstGPR; - // SDAG will promote a 64bit G_EXTRACT_VECTOR_ELT to 128 to reduce the - // number of duplicate lane-extract patterns needed. Do the same here so - // that selection will operate on the larger vectors. - LLT Ty = MRI.getType(MI.getOperand(1).getReg()); - if (!Ty.isScalable() && Ty.getSizeInBits() == 64) - MappingID = CustomMappingID; break; - } case TargetOpcode::G_INSERT_VECTOR_ELT: OpRegBankIdx[0] = PMI_FirstFPR; OpRegBankIdx[1] = PMI_FirstFPR; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir index 4e569e0bc7e5f..35bc36d472b1a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-extract-vector-elt.mir @@ -94,9 +94,7 @@ body: | ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr(<4 x s16>) = COPY $d0 ; CHECK-NEXT: [[C:%[0-9]+]]:gpr(s64) = G_CONSTANT i64 1 - ; CHECK-NEXT: [[DEF:%[0-9]+]]:fpr(<4 x s16>) = G_IMPLICIT_DEF - ; CHECK-NEXT: [[CONCAT_VECTORS:%[0-9]+]]:fpr(<8 x s16>) = G_CONCAT_VECTORS [[COPY]](<4 x s16>), [[DEF]](<4 x s16>) - ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[CONCAT_VECTORS]](<8 x s16>), [[C]](s64) + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:fpr(s16) = G_EXTRACT_VECTOR_ELT [[COPY]](<4 x s16>), [[C]](s64) ; CHECK-NEXT: $h0 = COPY [[EVEC]](s16) ; CHECK-NEXT: RET_ReallyLR implicit $h0 %0:_(<4 x s16>) = COPY $d0 diff --git a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll index 287344bdbd29f..7f922c0047553 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bif-gen.ll @@ -70,9 +70,6 @@ define <1 x i32> @test_bitf_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { ; ; CHECK-GI-LABEL: test_bitf_v1i32: ; 
CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: fmov w10, s0 diff --git a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll index 73fcee56506f9..b8eb8269d605c 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-bit-gen.ll @@ -70,9 +70,6 @@ define <1 x i32> @test_bit_v1i32(<1 x i32> %A, <1 x i32> %B, <1 x i32> %C) { ; ; CHECK-GI-LABEL: test_bit_v1i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 -; CHECK-GI-NEXT: // kill: def $d2 killed $d2 def $q2 ; CHECK-GI-NEXT: fmov w8, s2 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: fmov w10, s0 diff --git a/llvm/test/CodeGen/AArch64/abs.ll b/llvm/test/CodeGen/AArch64/abs.ll index 470d68a805718..0f56d25a47b2a 100644 --- a/llvm/test/CodeGen/AArch64/abs.ll +++ b/llvm/test/CodeGen/AArch64/abs.ll @@ -243,7 +243,6 @@ define <1 x i32> @abs_v1i32(<1 x i32> %a){ ; ; CHECK-GI-LABEL: abs_v1i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s0 ; CHECK-GI-NEXT: cmp w8, #0 diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll index 60af49d867be7..367105f783817 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll @@ -1215,7 +1215,6 @@ define <8 x i8> @testDUP.v1i8(<1 x i8> %a) { ; ; CHECK-GI-LABEL: testDUP.v1i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: dup v0.8b, w8 ; CHECK-GI-NEXT: ret @@ -1711,7 +1710,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v16i8(<8 x i8> %x, <16 x i8> %y) #0 { ; 
CHECK-GI-NEXT: mov v2.16b, v1.16b ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: adrp x8, .LCPI127_0 -; CHECK-GI-NEXT: mov b1, v0.b[0] +; CHECK-GI-NEXT: mov v1.b[0], v0.b[0] ; CHECK-GI-NEXT: mov v1.b[1], v0.b[1] ; CHECK-GI-NEXT: mov v1.b[2], v0.b[2] ; CHECK-GI-NEXT: mov v1.b[3], v0.b[3] @@ -1818,7 +1817,7 @@ define <16 x i8> @test_concat_v16i8_v8i8_v8i8(<8 x i8> %x, <8 x i8> %y) #0 { ; CHECK-GI-LABEL: test_concat_v16i8_v8i8_v8i8: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov b2, v0.b[0] +; CHECK-GI-NEXT: mov v2.b[0], v0.b[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: mov v2.b[1], v0.b[1] ; CHECK-GI-NEXT: mov v2.b[2], v0.b[2] @@ -1904,7 +1903,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v8i16(<4 x i16> %x, <8 x i16> %y) #0 { ; CHECK-GI-NEXT: mov v2.16b, v1.16b ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: adrp x8, .LCPI131_0 -; CHECK-GI-NEXT: mov h1, v0.h[0] +; CHECK-GI-NEXT: mov v1.h[0], v0.h[0] ; CHECK-GI-NEXT: mov v1.h[1], v0.h[1] ; CHECK-GI-NEXT: mov v1.h[2], v0.h[2] ; CHECK-GI-NEXT: mov v1.h[3], v0.h[3] @@ -1975,7 +1974,7 @@ define <8 x i16> @test_concat_v8i16_v4i16_v4i16(<4 x i16> %x, <4 x i16> %y) #0 { ; CHECK-GI-LABEL: test_concat_v8i16_v4i16_v4i16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov h2, v0.h[0] +; CHECK-GI-NEXT: mov v2.h[0], v0.h[0] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: mov v2.h[1], v0.h[1] ; CHECK-GI-NEXT: mov v2.h[2], v0.h[2] @@ -2037,7 +2036,7 @@ define <4 x i32> @test_concat_v4i32_v2i32_v4i32(<2 x i32> %x, <4 x i32> %y) #0 { ; CHECK-GI-NEXT: mov v2.16b, v1.16b ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: adrp x8, .LCPI135_0 -; CHECK-GI-NEXT: mov s1, v0.s[0] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NEXT: ldr q0, [x8, :lo12:.LCPI135_0] ; 
CHECK-GI-NEXT: tbl v0.16b, { v1.16b, v2.16b }, v0.16b @@ -2243,7 +2242,6 @@ define <8 x i8> @concat_vector_v8i8(<1 x i8> %a) { ; ; CHECK-GI-LABEL: concat_vector_v8i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: dup v0.8b, w8 ; CHECK-GI-NEXT: ret @@ -2270,7 +2268,6 @@ define <16 x i8> @concat_vector_v16i8(<1 x i8> %a) { ; ; CHECK-GI-LABEL: concat_vector_v16i8: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: dup v0.16b, w8 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll index ac6f041ccd70d..f47c06e1ba4cb 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-simd-ldst-one.ll @@ -614,11 +614,16 @@ entry: } define void @test_vst1_lane0_s16(ptr %a, <4 x i16> %b) { -; CHECK-LABEL: test_vst1_lane0_s16: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: str h0, [x0] -; CHECK-NEXT: ret +; CHECK-GI-LABEL: test_vst1_lane0_s16: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str h0, [x0] +; CHECK-GI-NEXT: ret +; +; CHECK-SD-LABEL: test_vst1_lane0_s16: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str h0, [x0] +; CHECK-SD-NEXT: ret entry: %0 = extractelement <4 x i16> %b, i32 0 store i16 %0, ptr %a, align 2 @@ -638,11 +643,16 @@ entry: } define void @test_vst1_lane0_s32(ptr %a, <2 x i32> %b) { -; CHECK-LABEL: test_vst1_lane0_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: str s0, [x0] -; CHECK-NEXT: ret +; CHECK-GI-LABEL: test_vst1_lane0_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str s0, [x0] +; CHECK-GI-NEXT: ret +; +; CHECK-SD-LABEL: test_vst1_lane0_s32: 
+; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret entry: %0 = extractelement <2 x i32> %b, i32 0 store i32 %0, ptr %a, align 4 @@ -673,11 +683,16 @@ entry: } define void @test_vst1_lane0_f32(ptr %a, <2 x float> %b) { -; CHECK-LABEL: test_vst1_lane0_f32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: str s0, [x0] -; CHECK-NEXT: ret +; CHECK-GI-LABEL: test_vst1_lane0_f32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: str s0, [x0] +; CHECK-GI-NEXT: ret +; +; CHECK-SD-LABEL: test_vst1_lane0_f32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret entry: %0 = extractelement <2 x float> %b, i32 0 store float %0, ptr %a, align 4 diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll index 1f8ac792d75f5..cb14adc00df00 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-v8.1a.ll @@ -663,14 +663,24 @@ entry: } define i32 @test_vqrdmlahs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) { -; CHECK-LABEL: test_vqrdmlahs_lane_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: fmov s2, w1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: sqrdmlah s1, s2, v0.s[1] -; CHECK-NEXT: fmov w0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vqrdmlahs_lane_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s1, w0 +; CHECK-SD-NEXT: fmov s2, w1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: sqrdmlah s1, s2, v0.s[1] +; CHECK-SD-NEXT: fmov w0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vqrdmlahs_lane_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: fmov s2, w1 +; CHECK-GI-NEXT: mov s0, v0.s[1] +; CHECK-GI-NEXT: sqrdmlah s1, s2, s0 +; CHECK-GI-NEXT: fmov 
w0, s1 +; CHECK-GI-NEXT: ret entry: %vget_lane = extractelement <2 x i32> %c, i64 1 %vqrdmlahs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlah.i32(i32 %a, i32 %b, i32 %vget_lane) #4 @@ -803,14 +813,24 @@ entry: } define i32 @test_vqrdmlshs_lane_s32(i32 %a, i32 %b, <2 x i32> %c) { -; CHECK-LABEL: test_vqrdmlshs_lane_s32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: fmov s2, w1 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: sqrdmlsh s1, s2, v0.s[1] -; CHECK-NEXT: fmov w0, s1 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vqrdmlshs_lane_s32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s1, w0 +; CHECK-SD-NEXT: fmov s2, w1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: sqrdmlsh s1, s2, v0.s[1] +; CHECK-SD-NEXT: fmov w0, s1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vqrdmlshs_lane_s32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: fmov s2, w1 +; CHECK-GI-NEXT: mov s0, v0.s[1] +; CHECK-GI-NEXT: sqrdmlsh s1, s2, s0 +; CHECK-GI-NEXT: fmov w0, s1 +; CHECK-GI-NEXT: ret entry: %vget_lane = extractelement <2 x i32> %c, i64 1 %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vget_lane) #4 @@ -847,6 +867,3 @@ entry: %vqrdmlshs_s32.i = tail call i32 @llvm.aarch64.neon.sqrdmlsh.i32(i32 %a, i32 %b, i32 %vgetq_lane) #4 ret i32 %vqrdmlshs_s32.i } -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; CHECK-GI: {{.*}} -; CHECK-SD: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll index eccf918f74312..d4cc154ac6afc 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll @@ -271,7 +271,6 @@ define half @test_vcvt_f16_f32(<1 x float> %x) { ; ; GISEL-LABEL: test_vcvt_f16_f32: ; GISEL: // %bb.0: -; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0 ; GISEL-NEXT: fcvt h0, s0 ; GISEL-NEXT: ret %tmp = fptrunc <1 x float> %x to <1 x half> diff --git a/llvm/test/CodeGen/AArch64/bswap.ll b/llvm/test/CodeGen/AArch64/bswap.ll index 9ae4782b52bd9..898958fb4993f 100644 --- a/llvm/test/CodeGen/AArch64/bswap.ll +++ b/llvm/test/CodeGen/AArch64/bswap.ll @@ -207,7 +207,6 @@ define <1 x i32> @bswap_v1i32(<1 x i32> %a){ ; ; CHECK-GI-LABEL: bswap_v1i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: rev w8, w8 ; CHECK-GI-NEXT: fmov s0, w8 diff --git a/llvm/test/CodeGen/AArch64/concat-vector.ll b/llvm/test/CodeGen/AArch64/concat-vector.ll index 1e8dd0c78043a..acf15f1bd1178 100644 --- a/llvm/test/CodeGen/AArch64/concat-vector.ll +++ b/llvm/test/CodeGen/AArch64/concat-vector.ll @@ -13,10 +13,11 @@ define <4 x i8> @concat1(<2 x i8> %A, <2 x i8> %B) { ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: mov w8, v0.s[1] ; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-GI-NEXT: mov w9, v1.s[1] ; CHECK-GI-NEXT: mov v0.h[1], w8 -; CHECK-GI-NEXT: mov w8, v1.s[1] -; CHECK-GI-NEXT: mov v0.h[2], v1.h[0] -; CHECK-GI-NEXT: mov v0.h[3], w8 +; CHECK-GI-NEXT: fmov w8, s1 +; CHECK-GI-NEXT: mov v0.h[2], w8 +; CHECK-GI-NEXT: mov v0.h[3], w9 ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 ; CHECK-GI-NEXT: ret %v4i8 = shufflevector <2 x i8> %A, <2 x i8> %B, <4 x i32> diff --git a/llvm/test/CodeGen/AArch64/double_reduct.ll 
b/llvm/test/CodeGen/AArch64/double_reduct.ll index 2d146bf9aae89..f30895db2c098 100644 --- a/llvm/test/CodeGen/AArch64/double_reduct.ll +++ b/llvm/test/CodeGen/AArch64/double_reduct.ll @@ -65,8 +65,10 @@ define float @fmul_f32(<8 x float> %a, <4 x float> %b) { ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s ; CHECK-GI-NEXT: fmul v1.2s, v2.2s, v3.2s -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] -; CHECK-GI-NEXT: fmul s1, s1, v1.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 ; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a) @@ -90,8 +92,10 @@ define float @fmul_f32_same(<4 x float> %a, <4 x float> %b) { ; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] -; CHECK-GI-NEXT: fmul s1, s1, v1.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 ; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) @@ -918,8 +922,10 @@ define float @nested_mul_f32(<4 x float> %a, <4 x float> %b, float %c, float %d) ; CHECK-GI-NEXT: mov d5, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v4.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v5.2s -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] -; CHECK-GI-NEXT: fmul s1, s1, v1.s[1] +; CHECK-GI-NEXT: mov s4, v0.s[1] +; CHECK-GI-NEXT: mov s5, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s4 +; CHECK-GI-NEXT: fmul s1, s1, s5 ; CHECK-GI-NEXT: fmul s0, s0, s2 ; CHECK-GI-NEXT: fmul s1, s1, s3 ; CHECK-GI-NEXT: fmul s0, s0, s1 diff --git a/llvm/test/CodeGen/AArch64/f16-instructions.ll b/llvm/test/CodeGen/AArch64/f16-instructions.ll index aa120f2643950..adc536da26f26 100644 --- 
a/llvm/test/CodeGen/AArch64/f16-instructions.ll +++ b/llvm/test/CodeGen/AArch64/f16-instructions.ll @@ -1496,7 +1496,7 @@ define half @test_copysign(half %a, half %b) #0 { ; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-GI-NEXT: // kill: def $h1 killed $h1 def $d1 ; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 ; CHECK-CVT-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: test_copysign: @@ -1505,7 +1505,7 @@ define half @test_copysign(half %a, half %b) #0 { ; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-FP16-GI-NEXT: // kill: def $h1 killed $h1 def $d1 ; CHECK-FP16-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 ; CHECK-FP16-GI-NEXT: ret %r = call half @llvm.copysign.f16(half %a, half %b) ret half %r @@ -1536,7 +1536,7 @@ define half @test_copysign_f32(half %a, float %b) #0 { ; CHECK-CVT-GI-NEXT: mvni v2.4h, #128, lsl #8 ; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 ; CHECK-CVT-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: test_copysign_f32: @@ -1545,7 +1545,7 @@ define half @test_copysign_f32(half %a, float %b) #0 { ; CHECK-FP16-GI-NEXT: mvni v2.4h, #128, lsl #8 ; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-FP16-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 ; CHECK-FP16-GI-NEXT: ret %tb = fptrunc float %b to half %r = call half @llvm.copysign.f16(half %a, half %tb) @@ -1577,7 +1577,7 @@ define half @test_copysign_f64(half %a, double %b) #0 { ; CHECK-CVT-GI-NEXT: mvni v2.4h, #128, lsl #8 ; CHECK-CVT-GI-NEXT: // kill: 
def $h0 killed $h0 def $d0 ; CHECK-CVT-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-CVT-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 ; CHECK-CVT-GI-NEXT: ret ; ; CHECK-FP16-GI-LABEL: test_copysign_f64: @@ -1586,7 +1586,7 @@ define half @test_copysign_f64(half %a, double %b) #0 { ; CHECK-FP16-GI-NEXT: mvni v2.4h, #128, lsl #8 ; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-FP16-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-FP16-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 ; CHECK-FP16-GI-NEXT: ret %tb = fptrunc double %b to half %r = call half @llvm.copysign.f16(half %a, half %tb) diff --git a/llvm/test/CodeGen/AArch64/faddsub.ll b/llvm/test/CodeGen/AArch64/faddsub.ll index 943073e2a603e..b15579199a059 100644 --- a/llvm/test/CodeGen/AArch64/faddsub.ll +++ b/llvm/test/CodeGen/AArch64/faddsub.ll @@ -196,7 +196,7 @@ define <7 x half> @fadd_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s ; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] ; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] @@ -537,7 +537,7 @@ define <7 x half> @fsub_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s ; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] ; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] diff --git a/llvm/test/CodeGen/AArch64/fcopysign.ll b/llvm/test/CodeGen/AArch64/fcopysign.ll index 7ac1f37af2e0b..3a5f7e2cd6b29 100644 --- a/llvm/test/CodeGen/AArch64/fcopysign.ll +++ 
b/llvm/test/CodeGen/AArch64/fcopysign.ll @@ -33,7 +33,7 @@ define float @copysign_f32(float %a, float %b) { ; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $d0 ; CHECK-GI-NEXT: // kill: def $s1 killed $s1 def $d1 ; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 ; CHECK-GI-NEXT: ret entry: %c = call float @llvm.copysign.f32(float %a, float %b) @@ -56,7 +56,7 @@ define half @copysign_f16(half %a, half %b) { ; CHECK-GI-NEXT: // kill: def $h0 killed $h0 def $d0 ; CHECK-GI-NEXT: // kill: def $h1 killed $h1 def $d1 ; CHECK-GI-NEXT: bif v0.8b, v1.8b, v2.8b -; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 ; CHECK-GI-NEXT: ret entry: %c = call half @llvm.copysign.f16(half %a, half %b) diff --git a/llvm/test/CodeGen/AArch64/fcvt.ll b/llvm/test/CodeGen/AArch64/fcvt.ll index 2c512de413aeb..b408e9c1bd4e6 100644 --- a/llvm/test/CodeGen/AArch64/fcvt.ll +++ b/llvm/test/CodeGen/AArch64/fcvt.ll @@ -169,7 +169,7 @@ define <7 x half> @ceil_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] ; CHECK-GI-NOFP16-NEXT: frintp v2.4s, v2.4s @@ -468,7 +468,7 @@ define <7 x half> @floor_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] ; CHECK-GI-NOFP16-NEXT: frintm v2.4s, v2.4s @@ -767,7 +767,7 @@ define <7 x half> @nearbyint_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-NEXT: 
mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] ; CHECK-GI-NOFP16-NEXT: frinti v2.4s, v2.4s @@ -1066,7 +1066,7 @@ define <7 x half> @roundeven_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] ; CHECK-GI-NOFP16-NEXT: frintn v2.4s, v2.4s @@ -1365,7 +1365,7 @@ define <7 x half> @rint_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] ; CHECK-GI-NOFP16-NEXT: frintx v2.4s, v2.4s @@ -1664,7 +1664,7 @@ define <7 x half> @round_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] ; CHECK-GI-NOFP16-NEXT: frinta v2.4s, v2.4s @@ -1963,7 +1963,7 @@ define <7 x half> @trunc_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-NEXT: mov v2.h[1], v0.h[5] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: mov v2.h[2], v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] ; CHECK-GI-NOFP16-NEXT: 
frintz v2.4s, v2.4s diff --git a/llvm/test/CodeGen/AArch64/fdiv.ll b/llvm/test/CodeGen/AArch64/fdiv.ll index d232ca4d9c131..5bdccccc62b99 100644 --- a/llvm/test/CodeGen/AArch64/fdiv.ll +++ b/llvm/test/CodeGen/AArch64/fdiv.ll @@ -199,7 +199,7 @@ define <7 x half> @fdiv_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v3.4h ; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s ; CHECK-GI-NOFP16-NEXT: fdiv v1.4s, v0.4s, v1.4s -; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v2.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s diff --git a/llvm/test/CodeGen/AArch64/fminimummaximum.ll b/llvm/test/CodeGen/AArch64/fminimummaximum.ll index 1c7c55d12a864..fb12f8acf1745 100644 --- a/llvm/test/CodeGen/AArch64/fminimummaximum.ll +++ b/llvm/test/CodeGen/AArch64/fminimummaximum.ll @@ -672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s ; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] ; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] -; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] ; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h ; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] @@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s ; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] ; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] -; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] ; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h ; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] diff --git a/llvm/test/CodeGen/AArch64/fminmax.ll b/llvm/test/CodeGen/AArch64/fminmax.ll index da9b57223cff7..64f0da8b4cd0f 100644 --- a/llvm/test/CodeGen/AArch64/fminmax.ll +++ b/llvm/test/CodeGen/AArch64/fminmax.ll @@ 
-672,7 +672,7 @@ define <7 x half> @min_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s ; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] ; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] -; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] ; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h ; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] @@ -770,7 +770,7 @@ define <7 x half> @max_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-NOFP16-GI-NEXT: fcvtn v2.4h, v2.4s ; CHECK-NOFP16-GI-NEXT: mov v4.h[2], v0.h[6] ; CHECK-NOFP16-GI-NEXT: mov v3.h[2], v1.h[6] -; CHECK-NOFP16-GI-NEXT: mov h0, v2.h[0] +; CHECK-NOFP16-GI-NEXT: mov v0.h[0], v2.h[0] ; CHECK-NOFP16-GI-NEXT: fcvtl v1.4s, v4.4h ; CHECK-NOFP16-GI-NEXT: fcvtl v3.4s, v3.4h ; CHECK-NOFP16-GI-NEXT: mov v0.h[1], v2.h[1] diff --git a/llvm/test/CodeGen/AArch64/fmla.ll b/llvm/test/CodeGen/AArch64/fmla.ll index ef59209b69921..a37aabb0b5384 100644 --- a/llvm/test/CodeGen/AArch64/fmla.ll +++ b/llvm/test/CodeGen/AArch64/fmla.ll @@ -268,7 +268,7 @@ define <7 x half> @fma_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16-NEXT: mov v6.h[2], v0.h[6] ; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] ; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v2.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v5.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v5.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v6.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v3.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v4.4h @@ -873,7 +873,7 @@ define <7 x half> @fmuladd_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s ; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov h0, v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1] @@ -1358,7 +1358,7 @@ define <7 x 
half> @fmul_v7f16(<7 x half> %a, <7 x half> %b, <7 x half> %c) { ; CHECK-GI-NOFP16-NEXT: fcvtn v3.4h, v0.4s ; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v2.h[6] ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s -; CHECK-GI-NOFP16-NEXT: mov h0, v3.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v3.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v2.4s, v5.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v1.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v3.h[1] diff --git a/llvm/test/CodeGen/AArch64/fmul.ll b/llvm/test/CodeGen/AArch64/fmul.ll index 51eba5666f681..bd3d1353e643e 100644 --- a/llvm/test/CodeGen/AArch64/fmul.ll +++ b/llvm/test/CodeGen/AArch64/fmul.ll @@ -196,7 +196,7 @@ define <7 x half> @fmul_v7f16(<7 x half> %a, <7 x half> %b) { ; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s ; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v0.h[6] ; CHECK-GI-NOFP16-NEXT: mov v3.h[2], v1.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v2.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v2.h[0] ; CHECK-GI-NOFP16-NEXT: fcvtl v1.4s, v4.4h ; CHECK-GI-NOFP16-NEXT: fcvtl v3.4s, v3.4h ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v2.h[1] diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll index bcebbf4982eaa..9c21d2bf083a2 100644 --- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll @@ -31,7 +31,6 @@ define <1 x i32> @test_signed_v1f32_v1i32(<1 x float> %f) { ; ; CHECK-GI-LABEL: test_signed_v1f32_v1i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: fcvtzs w8, s0 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll index 38895eb7bd761..44847a41287d6 100644 --- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll +++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll @@ -31,7 +31,6 @@ define <1 x i32> @test_unsigned_v1f32_v1i32(<1 x float> %f) { ; ; CHECK-GI-LABEL: test_unsigned_v1f32_v1i32: ; CHECK-GI: // %bb.0: -; 
CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-GI-NEXT: fcvtzu w8, s0 ; CHECK-GI-NEXT: fmov s0, w8 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fptrunc.ll b/llvm/test/CodeGen/AArch64/fptrunc.ll index a428c95c90387..1f84c944d7c16 100644 --- a/llvm/test/CodeGen/AArch64/fptrunc.ll +++ b/llvm/test/CodeGen/AArch64/fptrunc.ll @@ -263,7 +263,7 @@ define <3 x float> @fptrunc_v3f64_v3f32(<3 x double> %a) { ; CHECK-GI-NEXT: fcvt s2, d2 ; CHECK-GI-NEXT: mov v0.d[1], v1.d[0] ; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d -; CHECK-GI-NEXT: mov s0, v1.s[0] +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] ; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] ; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] ; CHECK-GI-NEXT: ret @@ -354,7 +354,7 @@ define <2 x half> @fptrunc_v2f32_v2f16(<2 x float> %a) { ; CHECK-GI-LABEL: fptrunc_v2f32_v2f16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: mov s1, v0.s[0] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/fsqrt.ll b/llvm/test/CodeGen/AArch64/fsqrt.ll index 1e888a4c0e193..6c5fd8e52b017 100644 --- a/llvm/test/CodeGen/AArch64/fsqrt.ll +++ b/llvm/test/CodeGen/AArch64/fsqrt.ll @@ -203,7 +203,7 @@ define <7 x half> @sqrt_v7f16(<7 x half> %a) { ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v2.4h ; CHECK-GI-NOFP16-NEXT: fcvtn v1.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: fsqrt v2.4s, v0.4s -; CHECK-GI-NOFP16-NEXT: mov h0, v1.h[0] +; CHECK-GI-NOFP16-NEXT: mov v0.h[0], v1.h[0] ; CHECK-GI-NOFP16-NEXT: mov v0.h[1], v1.h[1] ; CHECK-GI-NOFP16-NEXT: mov v0.h[2], v1.h[2] ; CHECK-GI-NOFP16-NEXT: fcvtn v2.4h, v2.4s diff --git a/llvm/test/CodeGen/AArch64/insertextract.ll b/llvm/test/CodeGen/AArch64/insertextract.ll index 1af36ccaefa30..5c89316e5f570 100644 --- a/llvm/test/CodeGen/AArch64/insertextract.ll +++ b/llvm/test/CodeGen/AArch64/insertextract.ll @@ -1478,11 +1478,16 @@ entry: } define float 
@extract_v2f32_0(<2 x float> %a, i32 %c) { -; CHECK-LABEL: extract_v2f32_0: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_v2f32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2f32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-GI-NEXT: ret entry: %d = extractelement <2 x float> %a, i32 0 ret float %d @@ -1681,11 +1686,16 @@ entry: } define half @extract_v4f16_0(<4 x half> %a, i32 %c) { -; CHECK-LABEL: extract_v4f16_0: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $h0 killed $h0 killed $q0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_v4f16_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $h0 killed $h0 killed $q0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v4f16_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: // kill: def $h0 killed $h0 killed $d0 +; CHECK-GI-NEXT: ret entry: %d = extractelement <4 x half> %a, i32 0 ret half %d @@ -2149,11 +2159,16 @@ entry: } define i32 @extract_v2i32_0(<2 x i32> %a, i32 %c) { -; CHECK-LABEL: extract_v2i32_0: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_v2i32_0: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_v2i32_0: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %d = extractelement <2 x i32> %a, i32 0 ret i32 %d diff --git a/llvm/test/CodeGen/AArch64/itofp.ll 
b/llvm/test/CodeGen/AArch64/itofp.ll index 5ec30b6e8a667..e8194b9bd9b27 100644 --- a/llvm/test/CodeGen/AArch64/itofp.ll +++ b/llvm/test/CodeGen/AArch64/itofp.ll @@ -4378,7 +4378,7 @@ define <3 x float> @stofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-GI-NEXT: scvtf v0.2d, v0.2d ; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d ; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d -; CHECK-GI-NEXT: mov s0, v1.s[0] +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] ; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] ; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] ; CHECK-GI-NEXT: ret @@ -4415,7 +4415,7 @@ define <3 x float> @utofp_v3i64_v3f32(<3 x i64> %a) { ; CHECK-GI-NEXT: ucvtf v0.2d, v0.2d ; CHECK-GI-NEXT: fcvtn v2.2s, v2.2d ; CHECK-GI-NEXT: fcvtn v1.2s, v0.2d -; CHECK-GI-NEXT: mov s0, v1.s[0] +; CHECK-GI-NEXT: mov v0.s[0], v1.s[0] ; CHECK-GI-NEXT: mov v0.s[1], v1.s[1] ; CHECK-GI-NEXT: mov v0.s[2], v2.s[0] ; CHECK-GI-NEXT: ret @@ -6393,7 +6393,7 @@ define <2 x half> @stofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: scvtf v0.2d, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: ret @@ -6439,7 +6439,7 @@ define <2 x half> @utofp_v2i64_v2f16(<2 x i64> %a) { ; CHECK-GI-NOFP16: // %bb.0: // %entry ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2d, v0.2d ; CHECK-GI-NOFP16-NEXT: fcvtn v0.2s, v0.2d -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: ret @@ -7375,7 +7375,7 @@ define <2 x half> @stofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-GI-LABEL: stofp_v2i32_v2f16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: scvtf v0.2s, v0.2s -; CHECK-GI-NEXT: mov s1, v0.s[0] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NEXT: fcvtn v0.4h, 
v1.4s ; CHECK-GI-NEXT: ret @@ -7395,7 +7395,7 @@ define <2 x half> @utofp_v2i32_v2f16(<2 x i32> %a) { ; CHECK-GI-LABEL: utofp_v2i32_v2f16: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ucvtf v0.2s, v0.2s -; CHECK-GI-NEXT: mov s1, v0.s[0] +; CHECK-GI-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NEXT: ret @@ -7602,7 +7602,7 @@ define <2 x half> @stofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #16 ; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #16 ; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: ret @@ -7637,7 +7637,7 @@ define <2 x half> @utofp_v2i16_v2f16(<2 x i16> %a) { ; CHECK-GI-NOFP16-NEXT: movi d1, #0x00ffff0000ffff ; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: ret @@ -8124,7 +8124,7 @@ define <2 x half> @stofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: shl v0.2s, v0.2s, #24 ; CHECK-GI-NOFP16-NEXT: sshr v0.2s, v0.2s, #24 ; CHECK-GI-NOFP16-NEXT: scvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: ret @@ -8175,7 +8175,7 @@ define <2 x half> @utofp_v2i8_v2f16(<2 x i8> %a) { ; CHECK-GI-NOFP16-NEXT: movi d1, #0x0000ff000000ff ; CHECK-GI-NOFP16-NEXT: and v0.8b, v0.8b, v1.8b ; CHECK-GI-NOFP16-NEXT: ucvtf v0.2s, v0.2s -; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[0] +; CHECK-GI-NOFP16-NEXT: mov v1.s[0], v0.s[0] ; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v0.s[1] ; CHECK-GI-NOFP16-NEXT: fcvtn 
v0.4h, v1.4s ; CHECK-GI-NOFP16-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/llvm.exp10.ll b/llvm/test/CodeGen/AArch64/llvm.exp10.ll index 9d165556f1c73..c1ea891bc86e7 100644 --- a/llvm/test/CodeGen/AArch64/llvm.exp10.ll +++ b/llvm/test/CodeGen/AArch64/llvm.exp10.ll @@ -301,17 +301,28 @@ define float @exp10_f32(float %x) { } define <1 x float> @exp10_v1f32(<1 x float> %x) { -; CHECK-LABEL: exp10_v1f32: -; CHECK: // %bb.0: -; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: .cfi_offset w30, -16 -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: bl exp10f -; CHECK-NEXT: // kill: def $s0 killed $s0 def $d0 -; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; CHECK-NEXT: ret +; SDAG-LABEL: exp10_v1f32: +; SDAG: // %bb.0: +; SDAG-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; SDAG-NEXT: .cfi_def_cfa_offset 16 +; SDAG-NEXT: .cfi_offset w30, -16 +; SDAG-NEXT: // kill: def $d0 killed $d0 def $q0 +; SDAG-NEXT: // kill: def $s0 killed $s0 killed $q0 +; SDAG-NEXT: bl exp10f +; SDAG-NEXT: // kill: def $s0 killed $s0 def $d0 +; SDAG-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; SDAG-NEXT: ret +; +; GISEL-LABEL: exp10_v1f32: +; GISEL: // %bb.0: +; GISEL-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 16 +; GISEL-NEXT: .cfi_offset w30, -16 +; GISEL-NEXT: // kill: def $s0 killed $s0 killed $d0 +; GISEL-NEXT: bl exp10f +; GISEL-NEXT: // kill: def $s0 killed $s0 def $d0 +; GISEL-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; GISEL-NEXT: ret %r = call <1 x float> @llvm.exp10.v1f32(<1 x float> %x) ret <1 x float> %r } diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll index eded13a6b3669..c158d8ad93b05 100644 --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll @@ -655,9 +655,7 @@ define i32 @ctpop_into_extract(ptr %p) { ; CHECKO0-NEXT: // implicit-def: $d2 ; CHECKO0-NEXT: fmov s2, w8 ; CHECKO0-NEXT: ldr d0, [x0] -; CHECKO0-NEXT: // implicit-def: $q1 -; CHECKO0-NEXT: fmov d1, d0 -; CHECKO0-NEXT: // kill: def $s1 killed $s1 killed $q1 +; CHECKO0-NEXT: fmov s1, s0 ; CHECKO0-NEXT: fmov w8, s1 ; CHECKO0-NEXT: fmov s1, w8 ; CHECKO0-NEXT: // kill: def $d1 killed $s1 @@ -727,9 +725,7 @@ define i32 @ctpop_into_extract(ptr %p) { ; GISELO0-NEXT: // implicit-def: $d2 ; GISELO0-NEXT: fmov s2, w8 ; GISELO0-NEXT: ldr d0, [x0] -; GISELO0-NEXT: // implicit-def: $q1 -; GISELO0-NEXT: fmov d1, d0 -; GISELO0-NEXT: // kill: def $s1 killed $s1 killed $q1 +; GISELO0-NEXT: fmov s1, s0 ; GISELO0-NEXT: fmov w8, s1 ; GISELO0-NEXT: fmov s1, w8 ; GISELO0-NEXT: // kill: def $d1 killed $s1 diff --git a/llvm/test/CodeGen/AArch64/ptradd.ll b/llvm/test/CodeGen/AArch64/ptradd.ll index 4a1c50b67ed7b..28a8f4303765b 100644 --- a/llvm/test/CodeGen/AArch64/ptradd.ll +++ b/llvm/test/CodeGen/AArch64/ptradd.ll @@ -51,7 +51,6 @@ define <1 x ptr> @vector_gep_v1i32(<1 x ptr> %b, <1 x i32> %off) { ; ; CHECK-GI-LABEL: vector_gep_v1i32: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fmov w8, s1 ; CHECK-GI-NEXT: fmov x9, d0 ; CHECK-GI-NEXT: add x8, x9, w8, sxtw diff --git a/llvm/test/CodeGen/AArch64/shift.ll 
b/llvm/test/CodeGen/AArch64/shift.ll index 1652eb70b0625..9827cb3526f99 100644 --- a/llvm/test/CodeGen/AArch64/shift.ll +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -595,8 +595,6 @@ define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){ ; ; CHECK-GI-LABEL: shl_v1i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: lsl w8, w8, w9 @@ -773,8 +771,6 @@ define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){ ; ; CHECK-GI-LABEL: ashr_v1i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: asr w8, w8, w9 @@ -947,8 +943,6 @@ define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){ ; ; CHECK-GI-LABEL: lshr_v1i32: ; CHECK-GI: // %bb.0: -; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-NEXT: // kill: def $d1 killed $d1 def $q1 ; CHECK-GI-NEXT: fmov w8, s0 ; CHECK-GI-NEXT: fmov w9, s1 ; CHECK-GI-NEXT: lsr w8, w8, w9 diff --git a/llvm/test/CodeGen/AArch64/store.ll b/llvm/test/CodeGen/AArch64/store.ll index 0fe1ef5039929..3a9f12b838702 100644 --- a/llvm/test/CodeGen/AArch64/store.ll +++ b/llvm/test/CodeGen/AArch64/store.ll @@ -167,11 +167,16 @@ define void @store_v16i16(<16 x i16> %a, ptr %ptr){ } define void @store_v1i32(<1 x i32> %a, ptr %ptr){ -; CHECK-LABEL: store_v1i32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: str s0, [x0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: store_v1i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: str s0, [x0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: store_v1i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: str s0, [x0] +; CHECK-GI-NEXT: ret store <1 x i32> %a, ptr %ptr ret void } diff --git 
a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 63e26a25f4e27..77483ebb2235c 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -930,85 +930,195 @@ define <2 x i8> @vector_to_vector_cast(<16 x i1> %arg) nounwind { ; CHECK-GI-LABEL: vector_to_vector_cast: ; CHECK-GI: ; %bb.0: ; CHECK-GI-NEXT: sub sp, sp, #16 -; CHECK-GI-NEXT: umov.b w10, v0[1] -; CHECK-GI-NEXT: umov.b w9, v0[1] +; CHECK-GI-NEXT: umov.b w8, v0[1] ; CHECK-GI-NEXT: mov d1, v0[1] -; CHECK-GI-NEXT: umov.b w8, v0[0] -; CHECK-GI-NEXT: umov.b w11, v0[0] -; CHECK-GI-NEXT: umov.b w12, v0[2] -; CHECK-GI-NEXT: umov.b w13, v0[2] +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: umov.b w14, v0[2] ; CHECK-GI-NEXT: umov.b w15, v0[3] +; CHECK-GI-NEXT: umov.b w11, v0[2] ; CHECK-GI-NEXT: umov.b w16, v0[4] -; CHECK-GI-NEXT: umov.b w14, v0[3] +; CHECK-GI-NEXT: umov.b w17, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[3] +; CHECK-GI-NEXT: and w8, w8, #0x1 ; CHECK-GI-NEXT: and w10, w10, #0x1 -; CHECK-GI-NEXT: and w9, w9, #0x1 -; CHECK-GI-NEXT: bfi w8, w10, #1, #31 -; CHECK-GI-NEXT: umov.b w10, v1[1] -; CHECK-GI-NEXT: and w12, w12, #0x1 -; CHECK-GI-NEXT: bfi w11, w9, #1, #31 -; CHECK-GI-NEXT: umov.b w9, v1[0] -; CHECK-GI-NEXT: and w13, w13, #0x1 -; CHECK-GI-NEXT: orr w8, w8, w12, lsl #2 -; CHECK-GI-NEXT: umov.b w12, v1[2] +; CHECK-GI-NEXT: umov.b w0, v1[1] +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: bfi w13, w10, #1, #31 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: umov.b w8, v1[0] +; CHECK-GI-NEXT: umov.b w10, v1[2] ; CHECK-GI-NEXT: and w15, w15, #0x1 -; CHECK-GI-NEXT: orr w11, w11, w13, lsl #2 -; CHECK-GI-NEXT: umov.b w13, v0[5] +; CHECK-GI-NEXT: orr w13, w13, w14, lsl #2 +; CHECK-GI-NEXT: umov.b w14, v1[3] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: 
and w0, w0, #0x1 ; CHECK-GI-NEXT: and w16, w16, #0x1 -; CHECK-GI-NEXT: orr w8, w8, w15, lsl #3 -; CHECK-GI-NEXT: umov.b w15, v1[3] +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #2 +; CHECK-GI-NEXT: orr w13, w13, w15, lsl #3 +; CHECK-GI-NEXT: umov.b w15, v1[4] +; CHECK-GI-NEXT: umov.b w11, v0[6] +; CHECK-GI-NEXT: bfi w8, w0, #1, #31 ; CHECK-GI-NEXT: and w10, w10, #0x1 -; CHECK-GI-NEXT: bfi w9, w10, #1, #31 -; CHECK-GI-NEXT: umov.b w10, v0[6] +; CHECK-GI-NEXT: and w17, w17, #0x1 +; CHECK-GI-NEXT: orr w13, w13, w16, lsl #4 ; CHECK-GI-NEXT: and w14, w14, #0x1 -; CHECK-GI-NEXT: orr w8, w8, w16, lsl #4 -; CHECK-GI-NEXT: umov.b w16, v1[4] +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #2 +; CHECK-GI-NEXT: umov.b w10, v1[5] +; CHECK-GI-NEXT: umov.b w16, v1[6] +; CHECK-GI-NEXT: orr w13, w13, w17, lsl #5 +; CHECK-GI-NEXT: umov.b w17, v0[4] +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #3 ; CHECK-GI-NEXT: and w12, w12, #0x1 -; CHECK-GI-NEXT: orr w9, w9, w12, lsl #2 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v1[7] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #3 +; CHECK-GI-NEXT: orr w11, w13, w11, lsl #6 +; CHECK-GI-NEXT: orr w8, w8, w15, lsl #4 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: and w0, w0, #0x1 +; CHECK-GI-NEXT: and w12, w17, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[1] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #5 +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #4 +; CHECK-GI-NEXT: umov.b w10, v0[0] +; CHECK-GI-NEXT: orr w11, w11, w0, lsl #7 +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w16, lsl #6 +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5 +; CHECK-GI-NEXT: umov.b w12, v0[6] +; CHECK-GI-NEXT: strb w11, [sp, #8] +; CHECK-GI-NEXT: and w11, w13, #0x1 +; CHECK-GI-NEXT: umov.b w13, v0[3] +; CHECK-GI-NEXT: orr w8, w8, w14, lsl #7 +; 
CHECK-GI-NEXT: umov.b w14, v0[7] +; CHECK-GI-NEXT: ldr b0, [sp, #8] +; CHECK-GI-NEXT: bfi w10, w11, #1, #31 +; CHECK-GI-NEXT: and w11, w15, #0x1 +; CHECK-GI-NEXT: strb w8, [sp, #9] +; CHECK-GI-NEXT: umov.b w15, v0[4] +; CHECK-GI-NEXT: and w8, w12, #0x1 +; CHECK-GI-NEXT: orr w10, w10, w11, lsl #2 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #6 +; CHECK-GI-NEXT: and w9, w13, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[1] +; CHECK-GI-NEXT: orr w9, w10, w9, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[5] +; CHECK-GI-NEXT: umov.b w12, v0[0] +; CHECK-GI-NEXT: and w13, w14, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[2] +; CHECK-GI-NEXT: umov.b w17, v0[3] +; CHECK-GI-NEXT: and w14, w15, #0x1 +; CHECK-GI-NEXT: umov.b w15, v0[2] +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #4 +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[3] +; CHECK-GI-NEXT: strb w8, [sp, #10] +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: bfi w12, w11, #1, #31 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #5 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: and w9, w15, #0x1 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: orr w9, w12, w9, lsl #2 +; CHECK-GI-NEXT: umov.b w12, v0[5] ; CHECK-GI-NEXT: and w13, w13, #0x1 -; CHECK-GI-NEXT: umov.b w12, v0[4] -; CHECK-GI-NEXT: orr w8, w8, w13, lsl #5 -; CHECK-GI-NEXT: umov.b w13, v1[5] +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w8, w8, w13, lsl #6 +; CHECK-GI-NEXT: umov.b w13, v0[0] +; CHECK-GI-NEXT: orr w9, w9, w14, lsl #3 +; CHECK-GI-NEXT: and w10, w10, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[3] +; CHECK-GI-NEXT: orr w9, w9, w10, lsl #4 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[7] +; CHECK-GI-NEXT: orr w8, w8, w11, lsl #7 +; CHECK-GI-NEXT: bfi w13, w15, #1, #31 +; CHECK-GI-NEXT: and w11, w16, #0x1 
+; CHECK-GI-NEXT: orr w9, w9, w10, lsl #5 +; CHECK-GI-NEXT: and w10, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[4] +; CHECK-GI-NEXT: strb w8, [sp, #11] +; CHECK-GI-NEXT: umov.b w15, v0[1] +; CHECK-GI-NEXT: umov.b w16, v0[3] +; CHECK-GI-NEXT: orr w8, w9, w10, lsl #6 +; CHECK-GI-NEXT: orr w9, w13, w11, lsl #2 +; CHECK-GI-NEXT: and w10, w12, #0x1 +; CHECK-GI-NEXT: and w11, w17, #0x1 +; CHECK-GI-NEXT: umov.b w12, v0[5] +; CHECK-GI-NEXT: umov.b w17, v0[0] +; CHECK-GI-NEXT: orr w8, w8, w10, lsl #7 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #3 +; CHECK-GI-NEXT: umov.b w10, v0[1] +; CHECK-GI-NEXT: and w11, w14, #0x1 +; CHECK-GI-NEXT: umov.b w14, v0[0] ; CHECK-GI-NEXT: and w15, w15, #0x1 -; CHECK-GI-NEXT: orr w9, w9, w15, lsl #3 +; CHECK-GI-NEXT: orr w9, w9, w11, lsl #4 +; CHECK-GI-NEXT: umov.b w11, v0[2] +; CHECK-GI-NEXT: umov.b w13, v0[6] +; CHECK-GI-NEXT: and w12, w12, #0x1 +; CHECK-GI-NEXT: bfi w17, w15, #1, #31 +; CHECK-GI-NEXT: umov.b w15, v0[5] +; CHECK-GI-NEXT: orr w9, w9, w12, lsl #5 ; CHECK-GI-NEXT: and w10, w10, #0x1 -; CHECK-GI-NEXT: umov.b w15, v0[7] -; CHECK-GI-NEXT: orr w8, w8, w10, lsl #6 -; CHECK-GI-NEXT: umov.b w10, v1[6] -; CHECK-GI-NEXT: and w16, w16, #0x1 -; CHECK-GI-NEXT: orr w9, w9, w16, lsl #4 -; CHECK-GI-NEXT: umov.b w16, v0[5] -; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3 +; CHECK-GI-NEXT: umov.b w12, v0[2] +; CHECK-GI-NEXT: bfi w14, w10, #1, #31 +; CHECK-GI-NEXT: umov.b w10, v0[4] +; CHECK-GI-NEXT: ldr b1, [sp, #9] +; CHECK-GI-NEXT: and w11, w11, #0x1 ; CHECK-GI-NEXT: and w13, w13, #0x1 -; CHECK-GI-NEXT: umov.b w14, v1[7] +; CHECK-GI-NEXT: strb w8, [sp, #12] +; CHECK-GI-NEXT: orr w11, w14, w11, lsl #2 +; CHECK-GI-NEXT: and w14, w16, #0x1 +; CHECK-GI-NEXT: umov.b w16, v0[4] ; CHECK-GI-NEXT: and w12, w12, #0x1 -; CHECK-GI-NEXT: orr w9, w9, w13, lsl #5 -; CHECK-GI-NEXT: umov.b w13, v0[6] -; CHECK-GI-NEXT: orr w11, w11, w12, lsl #4 +; CHECK-GI-NEXT: and w15, w15, #0x1 +; CHECK-GI-NEXT: orr w9, w9, w13, lsl #6 +; CHECK-GI-NEXT: orr w11, w11, w14, lsl #3 +; 
CHECK-GI-NEXT: orr w12, w17, w12, lsl #2 ; CHECK-GI-NEXT: and w10, w10, #0x1 -; CHECK-GI-NEXT: and w12, w15, #0x1 +; CHECK-GI-NEXT: and w17, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[5] +; CHECK-GI-NEXT: umov.b w14, v0[6] +; CHECK-GI-NEXT: orr w10, w11, w10, lsl #4 +; CHECK-GI-NEXT: orr w12, w12, w17, lsl #3 +; CHECK-GI-NEXT: umov.b w11, v0[7] +; CHECK-GI-NEXT: and w16, w16, #0x1 +; CHECK-GI-NEXT: umov.b w17, v0[6] +; CHECK-GI-NEXT: orr w10, w10, w15, lsl #5 ; CHECK-GI-NEXT: umov.b w15, v0[7] -; CHECK-GI-NEXT: orr w9, w9, w10, lsl #6 -; CHECK-GI-NEXT: and w10, w16, #0x1 -; CHECK-GI-NEXT: orr w8, w8, w12, lsl #7 -; CHECK-GI-NEXT: orr w10, w11, w10, lsl #5 -; CHECK-GI-NEXT: and w11, w14, #0x1 +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #4 +; CHECK-GI-NEXT: and w16, w0, #0x1 +; CHECK-GI-NEXT: umov.b w0, v0[7] +; CHECK-GI-NEXT: and w14, w14, #0x1 +; CHECK-GI-NEXT: orr w12, w12, w16, lsl #5 +; CHECK-GI-NEXT: orr w10, w10, w14, lsl #6 +; CHECK-GI-NEXT: and w11, w11, #0x1 +; CHECK-GI-NEXT: and w13, w17, #0x1 ; CHECK-GI-NEXT: orr w9, w9, w11, lsl #7 -; CHECK-GI-NEXT: and w11, w13, #0x1 -; CHECK-GI-NEXT: strb w8, [sp, #8] -; CHECK-GI-NEXT: orr w8, w10, w11, lsl #6 -; CHECK-GI-NEXT: ldr b0, [sp, #8] -; CHECK-GI-NEXT: strb w9, [sp, #9] -; CHECK-GI-NEXT: and w9, w15, #0x1 -; CHECK-GI-NEXT: ldr b1, [sp, #9] -; CHECK-GI-NEXT: orr w8, w8, w9, lsl #7 ; CHECK-GI-NEXT: mov.s v0[1], v1[0] -; CHECK-GI-NEXT: strb w8, [sp, #10] -; CHECK-GI-NEXT: strb w8, [sp, #11] +; CHECK-GI-NEXT: orr w11, w12, w13, lsl #6 +; CHECK-GI-NEXT: and w12, w15, #0x1 ; CHECK-GI-NEXT: ; kill: def $d0 killed $d0 killed $q0 -; CHECK-GI-NEXT: strb w8, [sp, #12] -; CHECK-GI-NEXT: strb w8, [sp, #13] +; CHECK-GI-NEXT: orr w8, w10, w12, lsl #7 +; CHECK-GI-NEXT: and w10, w0, #0x1 +; CHECK-GI-NEXT: strb w9, [sp, #13] +; CHECK-GI-NEXT: orr w9, w11, w10, lsl #7 ; CHECK-GI-NEXT: strb w8, [sp, #14] -; CHECK-GI-NEXT: strb w8, [sp, #15] +; CHECK-GI-NEXT: strb w9, [sp, #15] ; CHECK-GI-NEXT: add sp, sp, #16 ; CHECK-GI-NEXT: ret %bc 
= bitcast <16 x i1> %arg to <2 x i8> diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll index bd68b213ec988..1164e02a16c9e 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fadd-strict.ll @@ -79,10 +79,11 @@ define half @add_HalfH(<4 x half> %bin.rdx) { ; CHECK-GI-FP16-LABEL: add_HalfH: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov h1, v0.h[2] -; CHECK-GI-FP16-NEXT: faddp h2, v0.2h +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: fadd h1, h0, h1 ; CHECK-GI-FP16-NEXT: mov h0, v0.h[3] -; CHECK-GI-FP16-NEXT: fadd h1, h2, h1 +; CHECK-GI-FP16-NEXT: fadd h1, h1, h2 ; CHECK-GI-FP16-NEXT: fadd h0, h1, h0 ; CHECK-GI-FP16-NEXT: ret %r = call half @llvm.vector.reduce.fadd.f16.v4f16(half -0.0, <4 x half> %bin.rdx) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll index 1906ca9defa40..1d295a30a994b 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization-nan.ll @@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind { } define float @test_v1f32(<1 x float> %a) nounwind { -; CHECK-LABEL: test_v1f32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v1f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v1f32: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-SD-NEXT: ret +; +; 
CHECK-NOFP-GI-LABEL: test_v1f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v1f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-FP-GI-NEXT: ret %b = call float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll index 152eb66ebcdfe..ee2af110c84cd 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmax-legalization.ll @@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind { } define float @test_v1f32(<1 x float> %a) nounwind { -; CHECK-LABEL: test_v1f32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v1f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v1f32: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v1f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v1f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-FP-GI-NEXT: ret %b = call nnan float @llvm.vector.reduce.fmax.v1f32(<1 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll index a1b7118d8080d..be61f9b521795 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll +++ 
b/llvm/test/CodeGen/AArch64/vecreduce-fmaximum.ll @@ -40,11 +40,27 @@ define half @test_v1f16(<1 x half> %a) nounwind { } define float @test_v1f32(<1 x float> %a) nounwind { -; CHECK-LABEL: test_v1f32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v1f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v1f32: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v1f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v1f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-FP-GI-NEXT: ret %b = call float @llvm.vector.reduce.fmaximum.v1f32(<1 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll index d5f999add22c2..300081dc3ec40 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmin-legalization.ll @@ -44,11 +44,27 @@ define half @test_v1f16(<1 x half> %a) nounwind { } define float @test_v1f32(<1 x float> %a) nounwind { -; CHECK-LABEL: test_v1f32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v1f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v1f32: +; 
CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v1f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v1f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-FP-GI-NEXT: ret %b = call nnan float @llvm.vector.reduce.fmin.v1f32(<1 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll index 719cac8f33028..e735f670ced0c 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fminimum.ll @@ -40,11 +40,27 @@ define half @test_v1f16(<1 x half> %a) nounwind { } define float @test_v1f32(<1 x float> %a) nounwind { -; CHECK-LABEL: test_v1f32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: ret +; CHECK-NOFP-SD-LABEL: test_v1f32: +; CHECK-NOFP-SD: // %bb.0: +; CHECK-NOFP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NOFP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-NOFP-SD-NEXT: ret +; +; CHECK-FP-SD-LABEL: test_v1f32: +; CHECK-FP-SD: // %bb.0: +; CHECK-FP-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-FP-SD-NEXT: // kill: def $s0 killed $s0 killed $q0 +; CHECK-FP-SD-NEXT: ret +; +; CHECK-NOFP-GI-LABEL: test_v1f32: +; CHECK-NOFP-GI: // %bb.0: +; CHECK-NOFP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-NOFP-GI-NEXT: ret +; +; CHECK-FP-GI-LABEL: test_v1f32: +; CHECK-FP-GI: // %bb.0: +; CHECK-FP-GI-NEXT: // kill: def $s0 killed $s0 killed $d0 +; CHECK-FP-GI-NEXT: ret %b = call float @llvm.vector.reduce.fminimum.v1f32(<1 x float> %a) ret float %b } diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll 
b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll index e22a5a4af4fae..e1b21705c95f3 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul-strict.ll @@ -5,11 +5,18 @@ ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 define float @mul_HalfS(<2 x float> %bin.rdx) { -; CHECK-LABEL: mul_HalfS: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_HalfS: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_HalfS: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx) ret float %r } @@ -72,9 +79,12 @@ define half @mul_HalfH(<4 x half> %bin.rdx) { ; CHECK-GI-FP16-LABEL: mul_HalfH: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: fmul h1, h0, v0.h[1] -; CHECK-GI-FP16-NEXT: fmul h1, h1, v0.h[2] -; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[3] +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: fmul h1, h0, h1 +; CHECK-GI-FP16-NEXT: mov h0, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h1, h1, h2 +; CHECK-GI-FP16-NEXT: fmul h0, h1, h0 ; CHECK-GI-FP16-NEXT: ret %r = call half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx) ret half %r @@ -465,6 +475,3 @@ declare float @llvm.vector.reduce.fmul.f32.v4f32(float, <4 x float>) declare float @llvm.vector.reduce.fmul.f32.v8f32(float, <8 x float>) declare double @llvm.vector.reduce.fmul.f64.v2f64(double, <2 x double>) declare double 
@llvm.vector.reduce.fmul.f64.v4f64(double, <4 x double>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-GI: {{.*}} -; CHECK-SD: {{.*}} diff --git a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll index 5fd705b07ca3b..2429cf4b4597a 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-fmul.ll @@ -5,11 +5,18 @@ ; RUN: llc -mtriple=aarch64 -mattr=+fullfp16 -global-isel -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-GI,CHECK-GI-FP16 define float @mul_HalfS(<2 x float> %bin.rdx) { -; CHECK-LABEL: mul_HalfS: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmul s0, s0, v0.s[1] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: mul_HalfS: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmul s0, s0, v0.s[1] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: mul_HalfS: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 +; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v2f32(float 1.0, <2 x float> %bin.rdx) ret float %r } @@ -44,17 +51,20 @@ define half @mul_HalfH(<4 x half> %bin.rdx) { ; CHECK-GI-NOFP16-NEXT: fcvtl v0.4s, v0.4h ; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: mul_HalfH: ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-GI-FP16-NEXT: mov h1, v0.h[3] -; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1] -; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[2] -; CHECK-GI-FP16-NEXT: fmul h0, h2, h0 +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] 
+; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: fmul h1, h2, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 ; CHECK-GI-FP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fmul.f16.v4f16(half 1.0, <4 x half> %bin.rdx) ret half %r @@ -105,7 +115,8 @@ define half @mul_H(<8 x half> %bin.rdx) { ; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v1.4s, v0.4s ; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: ret ; @@ -113,10 +124,12 @@ define half @mul_H(<8 x half> %bin.rdx) { ; CHECK-GI-FP16: // %bb.0: ; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h -; CHECK-GI-FP16-NEXT: mov h1, v0.h[3] -; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1] -; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[2] -; CHECK-GI-FP16-NEXT: fmul h0, h2, h0 +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: fmul h1, h2, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 ; CHECK-GI-FP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %bin.rdx) ret half %r @@ -134,7 +147,8 @@ define float @mul_S(<4 x float> %bin.rdx) { ; CHECK-GI: // %bb.0: ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %bin.rdx) ret float %r @@ -206,7 +220,8 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; CHECK-GI-NOFP16-NEXT: fmul v0.4s, v0.4s, v1.4s ; CHECK-GI-NOFP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, 
v1.2s -; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov s1, v0.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s1 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: ret ; @@ -215,10 +230,12 @@ define half @mul_2H(<16 x half> %bin.rdx) { ; CHECK-GI-FP16-NEXT: fmul v0.8h, v0.8h, v1.8h ; CHECK-GI-FP16-NEXT: mov d1, v0.d[1] ; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v1.4h -; CHECK-GI-FP16-NEXT: mov h1, v0.h[3] -; CHECK-GI-FP16-NEXT: fmul h2, h0, v0.h[1] -; CHECK-GI-FP16-NEXT: fmul h0, h1, v0.h[2] -; CHECK-GI-FP16-NEXT: fmul h0, h2, h0 +; CHECK-GI-FP16-NEXT: mov h1, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h2, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 +; CHECK-GI-FP16-NEXT: fmul h1, h2, h3 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 ; CHECK-GI-FP16-NEXT: ret %r = call fast half @llvm.vector.reduce.fmul.f16.v16f16(half 1.0, <16 x half> %bin.rdx) ret half %r @@ -238,7 +255,8 @@ define float @mul_2S(<8 x float> %bin.rdx) { ; CHECK-GI-NEXT: fmul v0.4s, v0.4s, v1.4s ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %bin.rdx) ret float %r @@ -271,8 +289,9 @@ define float @mul_S_init_42(<4 x float> %bin.rdx) { ; CHECK-GI-NEXT: mov d1, v0.d[1] ; CHECK-GI-NEXT: mov w8, #1109917696 // =0x42280000 ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v1.2s +; CHECK-GI-NEXT: mov s1, v0.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] ; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 42.0, <4 x float> %bin.rdx) @@ -338,8 +357,10 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-NOFP16-NEXT: mov d3, v1.d[1] ; CHECK-GI-NOFP16-NEXT: fmul v0.2s, v0.2s, v2.2s ; 
CHECK-GI-NOFP16-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-GI-NOFP16-NEXT: fmul s0, s0, v0.s[1] -; CHECK-GI-NOFP16-NEXT: fmul s1, s1, v1.s[1] +; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov s3, v1.s[1] +; CHECK-GI-NOFP16-NEXT: fmul s0, s0, s2 +; CHECK-GI-NOFP16-NEXT: fmul s1, s1, s3 ; CHECK-GI-NOFP16-NEXT: fcvt h0, s0 ; CHECK-GI-NOFP16-NEXT: fcvt h1, s1 ; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 @@ -354,14 +375,18 @@ define half @fmul_reduct_reassoc_v8f16(<8 x half> %a, <8 x half> %b) { ; CHECK-GI-FP16-NEXT: mov d3, v1.d[1] ; CHECK-GI-FP16-NEXT: fmul v0.4h, v0.4h, v2.4h ; CHECK-GI-FP16-NEXT: fmul v1.4h, v1.4h, v3.4h -; CHECK-GI-FP16-NEXT: mov h2, v0.h[3] -; CHECK-GI-FP16-NEXT: mov h3, v1.h[3] -; CHECK-GI-FP16-NEXT: fmul h4, h0, v0.h[1] -; CHECK-GI-FP16-NEXT: fmul h0, h2, v0.h[2] -; CHECK-GI-FP16-NEXT: fmul h2, h1, v1.h[1] -; CHECK-GI-FP16-NEXT: fmul h1, h3, v1.h[2] -; CHECK-GI-FP16-NEXT: fmul h0, h4, h0 -; CHECK-GI-FP16-NEXT: fmul h1, h2, h1 +; CHECK-GI-FP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-FP16-NEXT: mov h3, v0.h[2] +; CHECK-GI-FP16-NEXT: mov h4, v0.h[3] +; CHECK-GI-FP16-NEXT: mov h5, v1.h[1] +; CHECK-GI-FP16-NEXT: mov h6, v1.h[2] +; CHECK-GI-FP16-NEXT: mov h7, v1.h[3] +; CHECK-GI-FP16-NEXT: fmul h0, h0, h2 +; CHECK-GI-FP16-NEXT: fmul h2, h3, h4 +; CHECK-GI-FP16-NEXT: fmul h1, h1, h5 +; CHECK-GI-FP16-NEXT: fmul h3, h6, h7 +; CHECK-GI-FP16-NEXT: fmul h0, h0, h2 +; CHECK-GI-FP16-NEXT: fmul h1, h1, h3 ; CHECK-GI-FP16-NEXT: fmul h0, h0, h1 ; CHECK-GI-FP16-NEXT: ret %r1 = call fast half @llvm.vector.reduce.fmul.f16.v8f16(half 1.0, <8 x half> %a) @@ -389,8 +414,10 @@ define float @fmul_reduct_reassoc_v8f32(<8 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] -; CHECK-GI-NEXT: fmul s1, s1, v1.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, 
s1, s3 ; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v8f32(float 1.0, <8 x float> %a) @@ -414,8 +441,10 @@ define float @fmul_reduct_reassoc_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] -; CHECK-GI-NEXT: fmul s1, s1, v1.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 ; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) @@ -442,10 +471,12 @@ define float @fmul_reduct_reassoc_v4f32_init(float %i, <4 x float> %a, <4 x floa ; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s ; CHECK-GI-NEXT: mov d3, v2.d[1] -; CHECK-GI-NEXT: fmul s1, s1, v1.s[1] +; CHECK-GI-NEXT: mov s4, v1.s[1] ; CHECK-GI-NEXT: fmul v2.2s, v2.2s, v3.2s +; CHECK-GI-NEXT: fmul s1, s1, s4 +; CHECK-GI-NEXT: mov s3, v2.s[1] ; CHECK-GI-NEXT: fmul s0, s0, s1 -; CHECK-GI-NEXT: fmul s1, s2, v2.s[1] +; CHECK-GI-NEXT: fmul s1, s2, s3 ; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float %i, <4 x float> %a) @@ -471,8 +502,10 @@ define float @fmul_reduct_reassoc_v4v8f32(<4 x float> %a, <8 x float> %b) { ; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] -; CHECK-GI-NEXT: fmul s1, s1, v1.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 ; CHECK-GI-NEXT: fmul s0, s0, s1 ; CHECK-GI-NEXT: ret %r1 = call fast float @llvm.vector.reduce.fmul.f32.v4f32(float 1.0, <4 x float> %a) @@ -523,8 +556,10 @@ define float @fmul_reduct_reassoc_v4f32_extrause(<4 x float> %a, 
<4 x float> %b) ; CHECK-GI-NEXT: mov d3, v1.d[1] ; CHECK-GI-NEXT: fmul v0.2s, v0.2s, v2.2s ; CHECK-GI-NEXT: fmul v1.2s, v1.2s, v3.2s -; CHECK-GI-NEXT: fmul s0, s0, v0.s[1] -; CHECK-GI-NEXT: fmul s1, s1, v1.s[1] +; CHECK-GI-NEXT: mov s2, v0.s[1] +; CHECK-GI-NEXT: mov s3, v1.s[1] +; CHECK-GI-NEXT: fmul s0, s0, s2 +; CHECK-GI-NEXT: fmul s1, s1, s3 ; CHECK-GI-NEXT: fmul s1, s0, s1 ; CHECK-GI-NEXT: fmul s0, s1, s0 ; CHECK-GI-NEXT: ret diff --git a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll index d5c040e09945b..0806f7da5c89c 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-umax-legalization.ll @@ -57,11 +57,16 @@ define i24 @test_v1i24(<1 x i24> %a) nounwind { } define i32 @test_v1i32(<1 x i32> %a) nounwind { -; CHECK-LABEL: test_v1i32: -; CHECK: // %bb.0: -; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_v1i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_v1i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret %b = call i32 @llvm.vector.reduce.umax.v1i32(<1 x i32> %a) ret i32 %b } diff --git a/llvm/test/CodeGen/AArch64/vector-lrint.ll b/llvm/test/CodeGen/AArch64/vector-lrint.ll index 53456c4c81ccc..602643264e7be 100644 --- a/llvm/test/CodeGen/AArch64/vector-lrint.ll +++ b/llvm/test/CodeGen/AArch64/vector-lrint.ll @@ -755,13 +755,20 @@ define <1 x iXLen> @lrint_v1f32(<1 x float> %x) { ; CHECK-i32-NEXT: fcvtzs v0.2s, v0.2s ; CHECK-i32-NEXT: ret ; -; CHECK-i64-LABEL: lrint_v1f32: -; CHECK-i64: // %bb.0: -; CHECK-i64-NEXT: // kill: def $d0 killed $d0 def $q0 -; CHECK-i64-NEXT: frintx s0, s0 -; CHECK-i64-NEXT: fcvtzs x8, s0 -; CHECK-i64-NEXT: fmov d0, x8 -; CHECK-i64-NEXT: ret +; CHECK-i64-SD-LABEL: lrint_v1f32: +; CHECK-i64-SD: // 
%bb.0: +; CHECK-i64-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-i64-SD-NEXT: frintx s0, s0 +; CHECK-i64-SD-NEXT: fcvtzs x8, s0 +; CHECK-i64-SD-NEXT: fmov d0, x8 +; CHECK-i64-SD-NEXT: ret +; +; CHECK-i64-GI-LABEL: lrint_v1f32: +; CHECK-i64-GI: // %bb.0: +; CHECK-i64-GI-NEXT: frintx s0, s0 +; CHECK-i64-GI-NEXT: fcvtzs x8, s0 +; CHECK-i64-GI-NEXT: fmov d0, x8 +; CHECK-i64-GI-NEXT: ret %a = call <1 x iXLen> @llvm.lrint.v1iXLen.v1f32(<1 x float> %x) ret <1 x iXLen> %a } @@ -1328,7 +1335,3 @@ define <32 x iXLen> @lrint_v32f64(<32 x double> %x) { ret <32 x iXLen> %a } declare <32 x iXLen> @llvm.lrint.v32iXLen.v32f64(<32 x double>) -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-i32-GI: {{.*}} -; CHECK-i64-GI: {{.*}} -; CHECK-i64-SD: {{.*}} From 14c11e4bcb262496981a2948af11a3f9e9de23ef Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang Date: Wed, 11 Jun 2025 09:39:31 +0200 Subject: [PATCH 041/851] [coro][NFC] Move switch basic block to beginning of coroutine (#143626) This makes the code flow when reading the LLVM IR of a split coroutine a bit more natural. It does not change anything from an end-user perspective but makes debugging the CoroSplit pass slightly easier. 
--- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index f9a6c70fedc2d..cebe44581b061 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -703,6 +703,7 @@ void coro::BaseCloner::replaceEntryBlock() { auto *SwitchBB = cast(VMap[Shape.SwitchLowering.ResumeEntryBlock]); Builder.CreateBr(SwitchBB); + SwitchBB->moveAfter(Entry); break; } case coro::ABI::Async: From 24d730b3808a562507f3f1f5fc125acf4b6e03aa Mon Sep 17 00:00:00 2001 From: Iris Shi <0.0@owo.li> Date: Wed, 11 Jun 2025 15:56:37 +0800 Subject: [PATCH 042/851] Reland "[SelectionDAG] Make `(a & x) | (~a & y) -> (a & (x ^ y)) ^ y` available for all targets" (#143651) --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 57 ++ .../Target/SystemZ/SystemZISelLowering.cpp | 14 + llvm/lib/Target/SystemZ/SystemZISelLowering.h | 1 + llvm/lib/Target/X86/X86ISelLowering.cpp | 58 -- llvm/test/CodeGen/AMDGPU/bfi_int.ll | 30 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 42 +- llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll | 161 +++-- .../CodeGen/AMDGPU/insert_vector_elt.v2i16.ll | 42 +- ...unfold-masked-merge-scalar-variablemask.ll | 42 +- ...unfold-masked-merge-vector-variablemask.ll | 167 +++-- llvm/test/CodeGen/RISCV/fold-masked-merge.ll | 302 +++++++++ ...unfold-masked-merge-scalar-variablemask.ll | 62 +- .../test/CodeGen/SystemZ/fold-masked-merge.ll | 277 ++++++++ llvm/test/CodeGen/WebAssembly/simd-arith.ll | 600 +++++++----------- llvm/test/CodeGen/X86/bitselect.ll | 50 +- llvm/test/CodeGen/X86/fold-masked-merge.ll | 54 +- ...unfold-masked-merge-scalar-variablemask.ll | 26 +- ...unfold-masked-merge-vector-variablemask.ll | 598 +++++++++-------- 18 files changed, 1524 insertions(+), 1059 deletions(-) create mode 100644 llvm/test/CodeGen/RISCV/fold-masked-merge.ll create mode 100644 
llvm/test/CodeGen/SystemZ/fold-masked-merge.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index b65e8e06eae62..e79a17e86bc87 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -8128,6 +8128,59 @@ static SDValue visitORCommutative(SelectionDAG &DAG, SDValue N0, SDValue N1, return SDValue(); } +static SDValue foldMaskedMergeImpl(SDValue AndL0, SDValue AndR0, SDValue AndL1, + SDValue AndR1, const SDLoc &DL, + SelectionDAG &DAG) { + if (!isBitwiseNot(AndL0, true) || !AndL0->hasOneUse()) + return SDValue(); + SDValue NotOp = AndL0->getOperand(0); + if (NotOp == AndR1) + std::swap(AndR1, AndL1); + if (NotOp != AndL1) + return SDValue(); + + EVT VT = AndL1.getValueType(); + SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, AndR1, AndR0); + SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp); + SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, AndR0); + return Xor1; +} + +/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the +/// equivalent `((x ^ y) & m) ^ y)` pattern. +/// This is typically a better representation for targets without a fused +/// "and-not" operation. +static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG, + const TargetLowering &TLI, const SDLoc &DL) { + // Note that masked-merge variants using XOR or ADD expressions are + // normalized to OR by InstCombine so we only check for OR. + assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); + SDValue N0 = Node->getOperand(0); + if (N0->getOpcode() != ISD::AND || !N0->hasOneUse()) + return SDValue(); + SDValue N1 = Node->getOperand(1); + if (N1->getOpcode() != ISD::AND || !N1->hasOneUse()) + return SDValue(); + + // If the target supports and-not, don't fold this. 
+ if (TLI.hasAndNot(SDValue(Node, 0))) + return SDValue(); + + SDValue N00 = N0->getOperand(0); + SDValue N01 = N0->getOperand(1); + SDValue N10 = N1->getOperand(0); + SDValue N11 = N1->getOperand(1); + if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG)) + return Result; + if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG)) + return Result; + return SDValue(); +} + SDValue DAGCombiner::visitOR(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); @@ -8306,6 +8359,10 @@ SDValue DAGCombiner::visitOR(SDNode *N) { if (SDValue R = foldLogicTreeOfShifts(N, N0, N1, DAG)) return R; + if (VT.isScalarInteger() && VT != MVT::i1) + if (SDValue R = foldMaskedMerge(N, DAG, TLI, DL)) + return R; + return SDValue(); } diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp index f06246706aaa9..1c59b1e63b7bc 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1283,6 +1283,20 @@ bool SystemZTargetLowering::allowsMisalignedMemoryAccesses( return true; } +bool SystemZTargetLowering::hasAndNot(SDValue Y) const { + EVT VT = Y.getValueType(); + + // We can use NC(G)RK for types in GPRs ... + if (VT == MVT::i32 || VT == MVT::i64) + return Subtarget.hasMiscellaneousExtensions3(); + + // ... or VNC for types in VRs. + if (VT.isVector() || VT == MVT::i128) + return Subtarget.hasVector(); + + return false; +} + // Information about the addressing mode for a memory access. struct AddressingMode { // True if a long displacement is supported. 
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index f3536a840fda8..f2f0bf6d8b410 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -671,6 +671,7 @@ class SystemZTargetLowering : public TargetLowering { } unsigned getStackProbeSize(const MachineFunction &MF) const; + bool hasAndNot(SDValue Y) const override; private: const SystemZSubtarget &Subtarget; diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 8bcd8670879a9..96714adf78e43 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52350,59 +52350,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), N->getValueType(0), Ret); } -static SDValue foldMaskedMergeImpl(SDValue And0_L, SDValue And0_R, - SDValue And1_L, SDValue And1_R, - const SDLoc &DL, SelectionDAG &DAG) { - if (!isBitwiseNot(And0_L, true) || !And0_L->hasOneUse()) - return SDValue(); - SDValue NotOp = And0_L->getOperand(0); - if (NotOp == And1_R) - std::swap(And1_R, And1_L); - if (NotOp != And1_L) - return SDValue(); - - // (~(NotOp) & And0_R) | (NotOp & And1_R) - // --> ((And0_R ^ And1_R) & NotOp) ^ And1_R - EVT VT = And1_L->getValueType(0); - SDValue Freeze_And0_R = DAG.getNode(ISD::FREEZE, SDLoc(), VT, And0_R); - SDValue Xor0 = DAG.getNode(ISD::XOR, DL, VT, And1_R, Freeze_And0_R); - SDValue And = DAG.getNode(ISD::AND, DL, VT, Xor0, NotOp); - SDValue Xor1 = DAG.getNode(ISD::XOR, DL, VT, And, Freeze_And0_R); - return Xor1; -} - -/// Fold "masked merge" expressions like `(m & x) | (~m & y)` into the -/// equivalent `((x ^ y) & m) ^ y)` pattern. -/// This is typically a better representation for targets without a fused -/// "and-not" operation. This function is intended to be called from a -/// `TargetLowering::PerformDAGCombine` callback on `ISD::OR` nodes. 
-static SDValue foldMaskedMerge(SDNode *Node, SelectionDAG &DAG) { - // Note that masked-merge variants using XOR or ADD expressions are - // normalized to OR by InstCombine so we only check for OR. - assert(Node->getOpcode() == ISD::OR && "Must be called with ISD::OR node"); - SDValue N0 = Node->getOperand(0); - if (N0->getOpcode() != ISD::AND || !N0->hasOneUse()) - return SDValue(); - SDValue N1 = Node->getOperand(1); - if (N1->getOpcode() != ISD::AND || !N1->hasOneUse()) - return SDValue(); - - SDLoc DL(Node); - SDValue N00 = N0->getOperand(0); - SDValue N01 = N0->getOperand(1); - SDValue N10 = N1->getOperand(0); - SDValue N11 = N1->getOperand(1); - if (SDValue Result = foldMaskedMergeImpl(N00, N01, N10, N11, DL, DAG)) - return Result; - if (SDValue Result = foldMaskedMergeImpl(N01, N00, N10, N11, DL, DAG)) - return Result; - if (SDValue Result = foldMaskedMergeImpl(N10, N11, N00, N01, DL, DAG)) - return Result; - if (SDValue Result = foldMaskedMergeImpl(N11, N10, N00, N01, DL, DAG)) - return Result; - return SDValue(); -} - /// If this is an add or subtract where one operand is produced by a cmp+setcc, /// then try to convert it to an ADC or SBB. This replaces TEST+SET+{ADD/SUB} /// with CMP+{ADC, SBB}. @@ -52806,11 +52753,6 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG, } } - // We should fold "masked merge" patterns when `andn` is not available. 
- if (!Subtarget.hasBMI() && VT.isScalarInteger() && VT != MVT::i1) - if (SDValue R = foldMaskedMerge(N, DAG)) - return R; - if (SDValue R = combineOrXorWithSETCC(N->getOpcode(), dl, VT, N0, N1, DAG)) return R; diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 201b97d479c68..b372dec383344 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -16,9 +16,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_andn2_b32 s2, s2, s0 +; GFX7-NEXT: s_xor_b32 s1, s1, s2 ; GFX7-NEXT: s_and_b32 s0, s1, s0 -; GFX7-NEXT: s_or_b32 s0, s2, s0 +; GFX7-NEXT: s_xor_b32 s0, s0, s2 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX7-NEXT: s_endpgm @@ -28,9 +28,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x2c ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_andn2_b32 s2, s2, s0 +; GFX8-NEXT: s_xor_b32 s1, s1, s2 ; GFX8-NEXT: s_and_b32 s0, s1, s0 -; GFX8-NEXT: s_or_b32 s0, s2, s0 +; GFX8-NEXT: s_xor_b32 s0, s0, s2 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -44,9 +44,9 @@ define amdgpu_kernel void @s_bfi_def_i32(ptr addrspace(1) %out, i32 %x, i32 %y, ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x24 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_andn2_b32 s2, s2, s0 +; GFX10-NEXT: s_xor_b32 s1, s1, s2 ; GFX10-NEXT: s_and_b32 s0, s1, s0 -; GFX10-NEXT: s_or_b32 s0, s2, s0 +; GFX10-NEXT: s_xor_b32 s0, s0, s2 ; GFX10-NEXT: v_mov_b32_e32 v1, s0 ; GFX10-NEXT: global_store_dword v0, v1, s[4:5] ; GFX10-NEXT: s_endpgm @@ -1407,9 +1407,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, 
i64 %mask) { ; GFX7-NEXT: s_mov_b32 s7, 0xf000 ; GFX7-NEXT: s_mov_b32 s6, -1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] -; GFX7-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] -; GFX7-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX7-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX7-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX7-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX7-NEXT: s_add_u32 s0, s0, 10 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -1422,9 +1422,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] -; GFX8-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] -; GFX8-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX8-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX8-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX8-NEXT: s_add_u32 s0, s0, 10 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 @@ -1438,9 +1438,9 @@ define amdgpu_kernel void @s_bitselect_i64_pat_0(i64 %a, i64 %b, i64 %mask) { ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: s_and_b64 s[2:3], s[0:1], s[2:3] -; GFX10-NEXT: s_andn2_b64 s[0:1], s[4:5], s[0:1] -; GFX10-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] +; GFX10-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] +; GFX10-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] ; GFX10-NEXT: s_add_u32 s0, s0, 10 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 6925a98f643b9..e1b4cad370f96 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ 
b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -289,16 +289,16 @@ entry: define amdgpu_kernel void @half4_inselt(ptr addrspace(1) %out, <4 x half> %vec, i32 %sel) { ; GCN-LABEL: half4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_mov_b32 s4, 0x3c003c00 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] ; GCN-NEXT: s_lshl_b32 s6, s6, 4 ; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -317,10 +317,10 @@ define amdgpu_kernel void @half2_inselt(ptr addrspace(1) %out, <2 x half> %vec, ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 +; GCN-NEXT: s_xor_b32 s4, s2, 0x3c003c00 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GCN-NEXT: s_andn2_b32 s2, s2, s3 -; GCN-NEXT: s_and_b32 s3, s3, 0x3c003c00 -; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s3, s4, s3 +; GCN-NEXT: s_xor_b32 s2, s3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -399,10 +399,10 @@ define amdgpu_kernel void @short2_inselt(ptr addrspace(1) %out, <2 x i16> %vec, ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_lshl_b32 s3, s3, 4 +; GCN-NEXT: s_xor_b32 s4, s2, 0x10001 ; GCN-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GCN-NEXT: s_andn2_b32 s2, s2, s3 -; GCN-NEXT: s_and_b32 s3, s3, 0x10001 -; GCN-NEXT: s_or_b32 s2, s3, s2 +; GCN-NEXT: s_and_b32 s3, s4, s3 +; GCN-NEXT: s_xor_b32 s2, s3, s2 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: 
v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 @@ -417,16 +417,16 @@ entry: define amdgpu_kernel void @short4_inselt(ptr addrspace(1) %out, <4 x i16> %vec, i32 %sel) { ; GCN-LABEL: short4_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_mov_b32 s4, 0x10001 ; GCN-NEXT: s_mov_b32 s5, s4 ; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_xor_b64 s[4:5], s[2:3], s[4:5] ; GCN-NEXT: s_lshl_b32 s6, s6, 4 ; GCN-NEXT: s_lshl_b64 s[6:7], 0xffff, s6 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[6:7] -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] -; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 @@ -442,15 +442,15 @@ entry: define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i32 %sel) { ; GCN-LABEL: byte8_inselt: ; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x34 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshl_b32 s4, s6, 3 -; GCN-NEXT: s_lshl_b64 s[4:5], 0xff, s4 -; GCN-NEXT: s_and_b32 s7, s5, 0x1010101 -; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 -; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] -; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; GCN-NEXT: s_xor_b32 s5, s3, 0x1010101 +; GCN-NEXT: s_lshl_b32 s6, s6, 3 +; GCN-NEXT: s_xor_b32 s4, s2, 0x1010101 +; GCN-NEXT: s_lshl_b64 s[6:7], 0xff, s6 +; GCN-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: s_xor_b64 s[2:3], s[4:5], s[2:3] ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v1, s1 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll index be16fac4c53f7..44bd4090436ef 100644 --- 
a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -1511,13 +1511,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_lshl_b32 s0, s3, 4 -; SI-NEXT: s_lshl_b32 s0, 0xffff, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_andn2_b32 s1, s2, s0 -; SI-NEXT: s_and_b32 s0, s0, 0x50005 -; SI-NEXT: s_or_b32 s0, s0, s1 +; SI-NEXT: s_lshl_b32 s1, s3, 4 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_xor_b32 s0, s2, 0x50005 +; SI-NEXT: s_lshl_b32 s1, 0xffff, s1 +; SI-NEXT: s_and_b32 s0, s0, s1 +; SI-NEXT: s_xor_b32 s0, s0, s2 ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -1528,13 +1528,13 @@ define amdgpu_kernel void @dynamic_insertelement_v2i16(ptr addrspace(1) %out, <2 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_lshl_b32 s0, s3, 4 -; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_andn2_b32 s1, s2, s0 -; VI-NEXT: s_and_b32 s0, s0, 0x50005 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s1, s3, 4 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_xor_b32 s0, s2, 0x50005 +; VI-NEXT: s_lshl_b32 s1, 0xffff, s1 +; VI-NEXT: s_and_b32 s0, s0, s1 +; VI-NEXT: s_xor_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm @@ -1552,13 +1552,13 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_lshl_b32 s0, s8, 4 +; SI-NEXT: s_lshl_b32 s8, s8, 4 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 -; SI-NEXT: s_and_b32 s9, s1, 0x50005 -; SI-NEXT: s_and_b32 s8, s0, 0x50005 -; SI-NEXT: 
s_andn2_b64 s[0:1], s[2:3], s[0:1] -; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; SI-NEXT: s_xor_b32 s1, s3, 0x50005 +; SI-NEXT: s_xor_b32 s0, s2, 0x50005 +; SI-NEXT: s_lshl_b64 s[8:9], 0xffff, s8 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; SI-NEXT: v_mov_b32_e32 v0, s0 @@ -1573,14 +1573,14 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_lshl_b32 s0, s8, 4 -; VI-NEXT: s_mov_b32 s8, 0x50005 +; VI-NEXT: s_mov_b32 s0, 0x50005 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshl_b64 s[0:1], 0xffff, s0 -; VI-NEXT: s_mov_b32 s9, s8 -; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] +; VI-NEXT: s_mov_b32 s1, s0 +; VI-NEXT: s_lshl_b32 s8, s8, 4 +; VI-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] +; VI-NEXT: s_lshl_b64 s[8:9], 0xffff, s8 ; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] -; VI-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4 ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -1594,35 +1594,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i16(ptr addrspace(1) %out, <3 define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 x i32], <2 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v2i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[8:9], 0x13 +; SI-NEXT: s_load_dword s4, s[8:9], 0xa +; SI-NEXT: s_load_dword s5, s[8:9], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: s_load_dword s5, s[8:9], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xff, s4 -; SI-NEXT: s_andn2_b32 s5, s5, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x505 -; 
SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_xor_b32 s6, s4, 0x505 +; SI-NEXT: s_lshl_b32 s5, s5, 3 +; SI-NEXT: s_lshl_b32 s5, 0xff, s5 +; SI-NEXT: s_and_b32 s5, s6, s5 +; SI-NEXT: s_xor_b32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v2i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[8:9], 0x4c +; VI-NEXT: s_load_dword s4, s[8:9], 0x28 +; VI-NEXT: s_load_dword s5, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: s_load_dword s5, s[8:9], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: s_lshl_b32 s4, 0xff, s4 -; VI-NEXT: s_and_b32 s6, s4, 0x505 -; VI-NEXT: s_xor_b32 s4, s4, 0xffff -; VI-NEXT: s_and_b32 s4, s4, s5 -; VI-NEXT: s_or_b32 s4, s6, s4 +; VI-NEXT: s_xor_b32 s6, s4, 0x505 +; VI-NEXT: s_lshl_b32 s5, s5, 3 +; VI-NEXT: s_lshl_b32 s5, 0xff, s5 +; VI-NEXT: s_and_b32 s5, s6, s5 +; VI-NEXT: s_xor_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1636,17 +1635,17 @@ define amdgpu_kernel void @dynamic_insertelement_v2i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 x i32], <3 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v3i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[8:9], 0x13 +; SI-NEXT: s_load_dword s4, s[8:9], 0xa +; SI-NEXT: s_load_dword s5, s[8:9], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: s_load_dword s5, s[8:9], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xff, s4 -; SI-NEXT: s_andn2_b32 s5, s5, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x5050505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_xor_b32 s6, s4, 0x5050505 +; SI-NEXT: s_lshl_b32 s5, s5, 3 +; 
SI-NEXT: s_lshl_b32 s5, 0xff, s5 +; SI-NEXT: s_and_b32 s5, s6, s5 +; SI-NEXT: s_xor_b32 s4, s5, s4 ; SI-NEXT: s_lshr_b32 s5, s4, 16 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1656,17 +1655,17 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 ; ; VI-LABEL: dynamic_insertelement_v3i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[8:9], 0x4c +; VI-NEXT: s_load_dword s4, s[8:9], 0x28 +; VI-NEXT: s_load_dword s5, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: s_load_dword s5, s[8:9], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: s_lshl_b32 s4, 0xff, s4 -; VI-NEXT: s_andn2_b32 s5, s5, s4 -; VI-NEXT: s_and_b32 s4, s4, 0x5050505 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_xor_b32 s6, s4, 0x5050505 +; VI-NEXT: s_lshl_b32 s5, s5, 3 +; VI-NEXT: s_lshl_b32 s5, 0xff, s5 +; VI-NEXT: s_and_b32 s5, s6, s5 +; VI-NEXT: s_xor_b32 s4, s5, s4 ; VI-NEXT: s_lshr_b32 s5, s4, 16 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -1681,34 +1680,34 @@ define amdgpu_kernel void @dynamic_insertelement_v3i8(ptr addrspace(1) %out, [8 define amdgpu_kernel void @dynamic_insertelement_v4i8(ptr addrspace(1) %out, [8 x i32], <4 x i8> %a, [8 x i32], i32 %b) nounwind { ; SI-LABEL: dynamic_insertelement_v4i8: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s4, s[8:9], 0x13 +; SI-NEXT: s_load_dword s4, s[8:9], 0xa +; SI-NEXT: s_load_dword s5, s[8:9], 0x13 ; SI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; SI-NEXT: s_load_dword s5, s[8:9], 0xa ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_lshl_b32 s4, s4, 3 -; SI-NEXT: s_lshl_b32 s4, 0xff, s4 -; SI-NEXT: s_andn2_b32 s5, s5, s4 -; SI-NEXT: s_and_b32 s4, s4, 0x5050505 -; SI-NEXT: s_or_b32 s4, s4, s5 +; SI-NEXT: s_xor_b32 s6, s4, 0x5050505 +; SI-NEXT: s_lshl_b32 s5, s5, 3 +; 
SI-NEXT: s_lshl_b32 s5, 0xff, s5 +; SI-NEXT: s_and_b32 s5, s6, s5 +; SI-NEXT: s_xor_b32 s4, s5, s4 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: dynamic_insertelement_v4i8: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s4, s[8:9], 0x4c +; VI-NEXT: s_load_dword s4, s[8:9], 0x28 +; VI-NEXT: s_load_dword s5, s[8:9], 0x4c ; VI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; VI-NEXT: s_load_dword s5, s[8:9], 0x28 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s4, s4, 3 -; VI-NEXT: s_lshl_b32 s4, 0xff, s4 -; VI-NEXT: s_andn2_b32 s5, s5, s4 -; VI-NEXT: s_and_b32 s4, s4, 0x5050505 -; VI-NEXT: s_or_b32 s4, s4, s5 +; VI-NEXT: s_xor_b32 s6, s4, 0x5050505 +; VI-NEXT: s_lshl_b32 s5, s5, 3 +; VI-NEXT: s_lshl_b32 s5, 0xff, s5 +; VI-NEXT: s_and_b32 s5, s6, s5 +; VI-NEXT: s_xor_b32 s4, s5, s4 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -1721,20 +1720,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p ; SI-LABEL: s_dynamic_insertelement_v8i8: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; SI-NEXT: s_load_dword s8, s[8:9], 0x4 ; SI-NEXT: s_mov_b32 s7, 0x100f000 ; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dword s8, s[8:9], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_lshl_b32 s0, s8, 3 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; SI-NEXT: s_and_b32 s9, s1, 0x5050505 +; SI-NEXT: s_lshl_b32 s8, s8, 3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; SI-NEXT: s_and_b32 s8, s0, 0x5050505 -; SI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] +; SI-NEXT: s_xor_b32 s1, s3, 0x5050505 +; SI-NEXT: s_xor_b32 s0, s2, 0x5050505 +; SI-NEXT: s_lshl_b64 s[8:9], 0xff, s8 +; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; SI-NEXT: 
s_xor_b64 s[0:1], s[0:1], s[2:3] ; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 @@ -1743,20 +1742,20 @@ define amdgpu_kernel void @s_dynamic_insertelement_v8i8(ptr addrspace(1) %out, p ; VI-LABEL: s_dynamic_insertelement_v8i8: ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0 -; VI-NEXT: s_load_dword s8, s[8:9], 0x10 ; VI-NEXT: s_mov_b32 s7, 0x1100f000 ; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_load_dword s8, s[8:9], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; VI-NEXT: s_mov_b32 s4, s0 -; VI-NEXT: s_lshl_b32 s0, s8, 3 ; VI-NEXT: s_mov_b32 s5, s1 -; VI-NEXT: s_lshl_b64 s[0:1], 0xff, s0 -; VI-NEXT: s_and_b32 s9, s1, 0x5050505 +; VI-NEXT: s_lshl_b32 s8, s8, 3 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; VI-NEXT: s_and_b32 s8, s0, 0x5050505 -; VI-NEXT: s_or_b64 s[0:1], s[8:9], s[2:3] +; VI-NEXT: s_xor_b32 s1, s3, 0x5050505 +; VI-NEXT: s_xor_b32 s0, s2, 0x5050505 +; VI-NEXT: s_lshl_b64 s[8:9], 0xff, s8 +; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[8:9] +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll index e0dacb7a59a42..a0ad6328b0c01 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.v2i16.ll @@ -1534,11 +1534,11 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX9-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX9-NEXT: s_load_dword s7, s[2:3], 0x0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: s_lshl_b32 s2, s6, 4 -; GFX9-NEXT: s_lshl_b32 s2, 0xffff, s2 -; GFX9-NEXT: s_andn2_b32 s3, s7, s2 -; GFX9-NEXT: s_and_b32 s2, s2, 0x3e703e7 -; GFX9-NEXT: s_or_b32 s2, s2, s3 +; GFX9-NEXT: s_lshl_b32 s3, s6, 
4 +; GFX9-NEXT: s_xor_b32 s2, s7, 0x3e703e7 +; GFX9-NEXT: s_lshl_b32 s3, 0xffff, s3 +; GFX9-NEXT: s_and_b32 s2, s2, s3 +; GFX9-NEXT: s_xor_b32 s2, s2, s7 ; GFX9-NEXT: v_mov_b32_e32 v1, s2 ; GFX9-NEXT: global_store_dword v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm @@ -1553,14 +1553,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s4, s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[2:3], 0x0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshl_b32 s0, s4, 4 -; VI-NEXT: s_lshl_b32 s0, 0xffff, s0 -; VI-NEXT: s_andn2_b32 s1, s2, s0 -; VI-NEXT: s_and_b32 s0, s0, 0x3e703e7 -; VI-NEXT: s_or_b32 s0, s0, s1 +; VI-NEXT: s_lshl_b32 s1, s4, 4 +; VI-NEXT: s_xor_b32 s0, s2, 0x3e703e7 +; VI-NEXT: s_lshl_b32 s1, 0xffff, s1 +; VI-NEXT: s_and_b32 s0, s0, s1 +; VI-NEXT: s_xor_b32 s0, s0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -1575,14 +1575,14 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: s_load_dword s4, s[4:5], 0x0 ; CI-NEXT: s_load_dword s2, s[2:3], 0x0 -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: s_waitcnt lgkmcnt(0) -; CI-NEXT: s_lshl_b32 s0, s4, 4 -; CI-NEXT: s_lshl_b32 s0, 0xffff, s0 -; CI-NEXT: s_andn2_b32 s1, s2, s0 -; CI-NEXT: s_and_b32 s0, s0, 0x3e703e7 -; CI-NEXT: s_or_b32 s0, s0, s1 +; CI-NEXT: s_lshl_b32 s1, s4, 4 +; CI-NEXT: s_xor_b32 s0, s2, 0x3e703e7 +; CI-NEXT: s_lshl_b32 s1, 0xffff, s1 +; CI-NEXT: s_and_b32 s0, s0, s1 +; CI-NEXT: s_xor_b32 s0, s0, s2 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: flat_store_dword v[0:1], v2 ; CI-NEXT: s_endpgm @@ -1597,12 +1597,12 @@ define amdgpu_kernel void @s_insertelement_v2i16_dynamic(ptr addrspace(1) %out, ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b32 s3, s4, 4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_xor_b32 s4, s2, 0x3e703e7 ; GFX11-NEXT: s_lshl_b32 s3, 0xffff, s3 -; GFX11-NEXT: s_and_not1_b32 s2, s2, s3 -; GFX11-NEXT: s_and_b32 s3, s3, 0x3e703e7 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s3, s2 +; GFX11-NEXT: s_and_b32 s3, s4, s3 +; GFX11-NEXT: s_xor_b32 s2, s3, s2 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll index 69724aa75af4f..321b64510c35f 100644 --- a/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/AMDGPU/unfold-masked-merge-scalar-variablemask.ll @@ -5,10 +5,11 @@ define i32 @s_out32(i32 inreg %x, i32 inreg %y, i32 inreg %mask) { ; GCN-LABEL: s_out32: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_and_b32 s0, s0, s2 -; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_xor_b32 s0, s0, s1 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %mx = and i32 %x, %mask @@ -22,10 +23,11 @@ define i64 @s_out64(i64 inreg %x, i64 inreg %y, i64 inreg %mask) { ; GCN-LABEL: s_out64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17] -; GCN-NEXT: s_and_not1_b64 s[2:3], s[2:3], s[16:17] +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] ; GCN-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], s[16:17] +; GCN-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GCN-NEXT: s_setpc_b64 s[30:31] %mx = and i64 %x, %mask @@ -427,10 +429,11 @@ define i32 @s_out_constant_varx_42(i32 inreg %x, i32 inreg %y, i32 inreg %mask) ; GCN-LABEL: s_out_constant_varx_42: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_and_b32 s0, s2, s0 -; GCN-NEXT: s_and_not1_b32 s1, 42, s2 +; GCN-NEXT: s_xor_b32 s0, s0, 42 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_xor_b32 s0, s0, 42 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 @@ -462,10 +465,11 @@ define i32 @s_out_constant_varx_42_invmask(i32 inreg %x, i32 inreg %y, i32 inreg ; GCN-LABEL: s_out_constant_varx_42_invmask: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_and_not1_b32 s0, s0, s2 -; GCN-NEXT: s_and_b32 s1, s2, 42 +; GCN-NEXT: s_xor_b32 s1, s0, 42 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s1, s1, s2 +; GCN-NEXT: s_xor_b32 s0, s1, s0 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 @@ -560,10 +564,11 @@ define i32 @s_out_constant_42_vary(i32 inreg %x, i32 inreg %y, i32 inreg %mask) ; GCN-LABEL: s_out_constant_42_vary: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_and_b32 s0, s2, 42 -; GCN-NEXT: s_and_not1_b32 s1, s1, s2 +; GCN-NEXT: s_xor_b32 s0, s1, 42 ; GCN-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_xor_b32 s0, s0, s1 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 @@ -595,10 +600,11 @@ define i32 @s_out_constant_42_vary_invmask(i32 inreg %x, i32 inreg %y, i32 inreg ; GCN-LABEL: s_out_constant_42_vary_invmask: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_and_not1_b32 s0, 42, s2 -; GCN-NEXT: s_and_b32 s1, s2, s1 +; GCN-NEXT: s_xor_b32 s0, s1, 42 ; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_or_b32 s0, s0, s1 +; GCN-NEXT: s_and_b32 s0, s0, s2 +; GCN-NEXT: s_xor_b32 s0, s0, 42 +; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: s_setpc_b64 s[30:31] %notmask = xor i32 %mask, -1 diff --git a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll index 8e4c77e76029c..bac8bbbf0b4de 100644 --- a/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/NVPTX/unfold-masked-merge-vector-variablemask.ll @@ -8,17 +8,16 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<8>; +; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b8 %rs1, [out_v1i8_param_0]; -; CHECK-NEXT: ld.param.b8 %rs2, [out_v1i8_param_2]; -; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: ld.param.b8 %rs4, [out_v1i8_param_1]; -; CHECK-NEXT: not.b16 %rs5, %rs2; -; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5; -; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6; -; CHECK-NEXT: st.param.b8 [func_retval0], %rs7; +; CHECK-NEXT: ld.param.b8 %rs2, [out_v1i8_param_1]; +; CHECK-NEXT: ld.param.b8 %rs3, 
[out_v1i8_param_2]; +; CHECK-NEXT: xor.b16 %rs4, %rs1, %rs2; +; CHECK-NEXT: and.b16 %rs5, %rs4, %rs3; +; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2; +; CHECK-NEXT: st.param.b8 [func_retval0], %rs6; ; CHECK-NEXT: ret; %mx = and <1 x i8> %x, %mask %notmask = xor <1 x i8> %mask, @@ -34,17 +33,16 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16( ; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<8>; +; CHECK-NEXT: .reg .b16 %rs<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b16 %rs1, [out_v1i16_param_0]; -; CHECK-NEXT: ld.param.b16 %rs2, [out_v1i16_param_2]; -; CHECK-NEXT: and.b16 %rs3, %rs1, %rs2; -; CHECK-NEXT: ld.param.b16 %rs4, [out_v1i16_param_1]; -; CHECK-NEXT: not.b16 %rs5, %rs2; -; CHECK-NEXT: and.b16 %rs6, %rs4, %rs5; -; CHECK-NEXT: or.b16 %rs7, %rs3, %rs6; -; CHECK-NEXT: st.param.b16 [func_retval0], %rs7; +; CHECK-NEXT: ld.param.b16 %rs2, [out_v1i16_param_1]; +; CHECK-NEXT: ld.param.b16 %rs3, [out_v1i16_param_2]; +; CHECK-NEXT: xor.b16 %rs4, %rs1, %rs2; +; CHECK-NEXT: and.b16 %rs5, %rs4, %rs3; +; CHECK-NEXT: xor.b16 %rs6, %rs5, %rs2; +; CHECK-NEXT: st.param.b16 [func_retval0], %rs6; ; CHECK-NEXT: ret; %mx = and <1 x i16> %x, %mask %notmask = xor <1 x i16> %mask, @@ -126,17 +124,16 @@ define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwin define <1 x i32> @out_v1i32(<1 x i32> %x, <1 x i32> %y, <1 x i32> %mask) nounwind { ; CHECK-LABEL: out_v1i32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<8>; +; CHECK-NEXT: .reg .b32 %r<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b32 %r1, [out_v1i32_param_0]; -; CHECK-NEXT: ld.param.b32 %r2, [out_v1i32_param_2]; -; CHECK-NEXT: and.b32 %r3, %r1, %r2; -; CHECK-NEXT: ld.param.b32 %r4, [out_v1i32_param_1]; -; CHECK-NEXT: not.b32 %r5, %r2; -; CHECK-NEXT: and.b32 %r6, %r4, %r5; -; CHECK-NEXT: or.b32 %r7, %r3, %r6; -; CHECK-NEXT: st.param.b32 
[func_retval0], %r7; +; CHECK-NEXT: ld.param.b32 %r2, [out_v1i32_param_1]; +; CHECK-NEXT: ld.param.b32 %r3, [out_v1i32_param_2]; +; CHECK-NEXT: xor.b32 %r4, %r1, %r2; +; CHECK-NEXT: and.b32 %r5, %r4, %r3; +; CHECK-NEXT: xor.b32 %r6, %r5, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r6; ; CHECK-NEXT: ret; %mx = and <1 x i32> %x, %mask %notmask = xor <1 x i32> %mask, @@ -230,21 +227,19 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwind { ; CHECK-LABEL: out_v2i32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<15>; +; CHECK-NEXT: .reg .b32 %r<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b32 {%r1, %r2}, [out_v2i32_param_0]; -; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_2]; -; CHECK-NEXT: and.b32 %r5, %r1, %r3; -; CHECK-NEXT: and.b32 %r6, %r2, %r4; -; CHECK-NEXT: ld.param.v2.b32 {%r7, %r8}, [out_v2i32_param_1]; -; CHECK-NEXT: not.b32 %r9, %r4; -; CHECK-NEXT: not.b32 %r10, %r3; -; CHECK-NEXT: and.b32 %r11, %r7, %r10; -; CHECK-NEXT: and.b32 %r12, %r8, %r9; -; CHECK-NEXT: or.b32 %r13, %r6, %r12; -; CHECK-NEXT: or.b32 %r14, %r5, %r11; -; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r14, %r13}; +; CHECK-NEXT: ld.param.v2.b32 {%r3, %r4}, [out_v2i32_param_1]; +; CHECK-NEXT: ld.param.v2.b32 {%r5, %r6}, [out_v2i32_param_2]; +; CHECK-NEXT: xor.b32 %r7, %r2, %r4; +; CHECK-NEXT: and.b32 %r8, %r7, %r6; +; CHECK-NEXT: xor.b32 %r9, %r8, %r4; +; CHECK-NEXT: xor.b32 %r10, %r1, %r3; +; CHECK-NEXT: and.b32 %r11, %r10, %r5; +; CHECK-NEXT: xor.b32 %r12, %r11, %r3; +; CHECK-NEXT: st.param.v2.b32 [func_retval0], {%r12, %r9}; ; CHECK-NEXT: ret; %mx = and <2 x i32> %x, %mask %notmask = xor <2 x i32> %mask, @@ -256,17 +251,16 @@ define <2 x i32> @out_v2i32(<2 x i32> %x, <2 x i32> %y, <2 x i32> %mask) nounwin define <1 x i64> @out_v1i64(<1 x i64> %x, <1 x i64> %y, <1 x i64> %mask) nounwind { ; CHECK-LABEL: out_v1i64( ; CHECK: { -; CHECK-NEXT: .reg 
.b64 %rd<8>; +; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.b64 %rd1, [out_v1i64_param_0]; -; CHECK-NEXT: ld.param.b64 %rd2, [out_v1i64_param_2]; -; CHECK-NEXT: and.b64 %rd3, %rd1, %rd2; -; CHECK-NEXT: ld.param.b64 %rd4, [out_v1i64_param_1]; -; CHECK-NEXT: not.b64 %rd5, %rd2; -; CHECK-NEXT: and.b64 %rd6, %rd4, %rd5; -; CHECK-NEXT: or.b64 %rd7, %rd3, %rd6; -; CHECK-NEXT: st.param.b64 [func_retval0], %rd7; +; CHECK-NEXT: ld.param.b64 %rd2, [out_v1i64_param_1]; +; CHECK-NEXT: ld.param.b64 %rd3, [out_v1i64_param_2]; +; CHECK-NEXT: xor.b64 %rd4, %rd1, %rd2; +; CHECK-NEXT: and.b64 %rd5, %rd4, %rd3; +; CHECK-NEXT: xor.b64 %rd6, %rd5, %rd2; +; CHECK-NEXT: st.param.b64 [func_retval0], %rd6; ; CHECK-NEXT: ret; %mx = and <1 x i64> %x, %mask %notmask = xor <1 x i64> %mask, @@ -350,29 +344,25 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<29>; +; CHECK-NEXT: .reg .b32 %r<25>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_param_0]; -; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_2]; -; CHECK-NEXT: and.b32 %r9, %r1, %r5; -; CHECK-NEXT: and.b32 %r10, %r2, %r6; -; CHECK-NEXT: and.b32 %r11, %r3, %r7; -; CHECK-NEXT: and.b32 %r12, %r4, %r8; -; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_param_1]; -; CHECK-NEXT: not.b32 %r17, %r8; -; CHECK-NEXT: not.b32 %r18, %r7; -; CHECK-NEXT: not.b32 %r19, %r6; -; CHECK-NEXT: not.b32 %r20, %r5; -; CHECK-NEXT: and.b32 %r21, %r13, %r20; -; CHECK-NEXT: and.b32 %r22, %r14, %r19; -; CHECK-NEXT: and.b32 %r23, %r15, %r18; -; CHECK-NEXT: and.b32 %r24, %r16, %r17; -; CHECK-NEXT: or.b32 %r25, %r12, %r24; -; CHECK-NEXT: or.b32 %r26, %r11, %r23; -; CHECK-NEXT: or.b32 %r27, %r10, %r22; -; CHECK-NEXT: or.b32 %r28, %r9, %r21; -; 
CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r28, %r27, %r26, %r25}; +; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_param_1]; +; CHECK-NEXT: ld.param.v4.b32 {%r9, %r10, %r11, %r12}, [out_v4i32_param_2]; +; CHECK-NEXT: xor.b32 %r13, %r4, %r8; +; CHECK-NEXT: and.b32 %r14, %r13, %r12; +; CHECK-NEXT: xor.b32 %r15, %r14, %r8; +; CHECK-NEXT: xor.b32 %r16, %r3, %r7; +; CHECK-NEXT: and.b32 %r17, %r16, %r11; +; CHECK-NEXT: xor.b32 %r18, %r17, %r7; +; CHECK-NEXT: xor.b32 %r19, %r2, %r6; +; CHECK-NEXT: and.b32 %r20, %r19, %r10; +; CHECK-NEXT: xor.b32 %r21, %r20, %r6; +; CHECK-NEXT: xor.b32 %r22, %r1, %r5; +; CHECK-NEXT: and.b32 %r23, %r22, %r9; +; CHECK-NEXT: xor.b32 %r24, %r23, %r5; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r24, %r21, %r18, %r15}; ; CHECK-NEXT: ret; %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -384,26 +374,23 @@ define <4 x i32> @out_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwin define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) nounwind { ; CHECK-LABEL: out_v4i32_undef( ; CHECK: { -; CHECK-NEXT: .reg .b32 %r<26>; +; CHECK-NEXT: .reg .b32 %r<23>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v4.b32 {%r1, %r2, %r3, %r4}, [out_v4i32_undef_param_0]; ; CHECK-NEXT: ld.param.v4.b32 {%r5, %r6, %r7, %r8}, [out_v4i32_undef_param_2]; ; CHECK-NEXT: and.b32 %r9, %r3, %r7; -; CHECK-NEXT: and.b32 %r10, %r1, %r5; -; CHECK-NEXT: and.b32 %r11, %r2, %r6; -; CHECK-NEXT: and.b32 %r12, %r4, %r8; -; CHECK-NEXT: ld.param.v4.b32 {%r13, %r14, %r15, %r16}, [out_v4i32_undef_param_1]; -; CHECK-NEXT: not.b32 %r17, %r8; -; CHECK-NEXT: not.b32 %r18, %r6; -; CHECK-NEXT: not.b32 %r19, %r5; -; CHECK-NEXT: and.b32 %r20, %r13, %r19; -; CHECK-NEXT: and.b32 %r21, %r14, %r18; -; CHECK-NEXT: and.b32 %r22, %r16, %r17; -; CHECK-NEXT: or.b32 %r23, %r12, %r22; -; CHECK-NEXT: or.b32 %r24, %r11, %r21; -; CHECK-NEXT: or.b32 %r25, %r10, %r20; -; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r25, %r24, 
%r9, %r23}; +; CHECK-NEXT: ld.param.v4.b32 {%r10, %r11, %r12, %r13}, [out_v4i32_undef_param_1]; +; CHECK-NEXT: xor.b32 %r14, %r4, %r13; +; CHECK-NEXT: and.b32 %r15, %r14, %r8; +; CHECK-NEXT: xor.b32 %r16, %r15, %r13; +; CHECK-NEXT: xor.b32 %r17, %r2, %r11; +; CHECK-NEXT: and.b32 %r18, %r17, %r6; +; CHECK-NEXT: xor.b32 %r19, %r18, %r11; +; CHECK-NEXT: xor.b32 %r20, %r1, %r10; +; CHECK-NEXT: and.b32 %r21, %r20, %r5; +; CHECK-NEXT: xor.b32 %r22, %r21, %r10; +; CHECK-NEXT: st.param.v4.b32 [func_retval0], {%r22, %r19, %r9, %r16}; ; CHECK-NEXT: ret; %mx = and <4 x i32> %x, %mask %notmask = xor <4 x i32> %mask, @@ -415,21 +402,19 @@ define <4 x i32> @out_v4i32_undef(<4 x i32> %x, <4 x i32> %y, <4 x i32> %mask) n define <2 x i64> @out_v2i64(<2 x i64> %x, <2 x i64> %y, <2 x i64> %mask) nounwind { ; CHECK-LABEL: out_v2i64( ; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<15>; +; CHECK-NEXT: .reg .b64 %rd<13>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: ld.param.v2.b64 {%rd1, %rd2}, [out_v2i64_param_0]; -; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_2]; -; CHECK-NEXT: and.b64 %rd5, %rd1, %rd3; -; CHECK-NEXT: and.b64 %rd6, %rd2, %rd4; -; CHECK-NEXT: ld.param.v2.b64 {%rd7, %rd8}, [out_v2i64_param_1]; -; CHECK-NEXT: not.b64 %rd9, %rd4; -; CHECK-NEXT: not.b64 %rd10, %rd3; -; CHECK-NEXT: and.b64 %rd11, %rd7, %rd10; -; CHECK-NEXT: and.b64 %rd12, %rd8, %rd9; -; CHECK-NEXT: or.b64 %rd13, %rd6, %rd12; -; CHECK-NEXT: or.b64 %rd14, %rd5, %rd11; -; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd14, %rd13}; +; CHECK-NEXT: ld.param.v2.b64 {%rd3, %rd4}, [out_v2i64_param_1]; +; CHECK-NEXT: ld.param.v2.b64 {%rd5, %rd6}, [out_v2i64_param_2]; +; CHECK-NEXT: xor.b64 %rd7, %rd2, %rd4; +; CHECK-NEXT: and.b64 %rd8, %rd7, %rd6; +; CHECK-NEXT: xor.b64 %rd9, %rd8, %rd4; +; CHECK-NEXT: xor.b64 %rd10, %rd1, %rd3; +; CHECK-NEXT: and.b64 %rd11, %rd10, %rd5; +; CHECK-NEXT: xor.b64 %rd12, %rd11, %rd3; +; CHECK-NEXT: st.param.v2.b64 [func_retval0], {%rd12, %rd9}; ; CHECK-NEXT: ret; %mx = and 
<2 x i64> %x, %mask %notmask = xor <2 x i64> %mask, diff --git a/llvm/test/CodeGen/RISCV/fold-masked-merge.ll b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll new file mode 100644 index 0000000000000..631b7109281e5 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/fold-masked-merge.ll @@ -0,0 +1,302 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV32,RV32I +; RUN: llc -mtriple=riscv64 < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-I,RV64,RV64I +; RUN: llc -mtriple=riscv32 -mattr=+zbb < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV32,RV32ZBB +; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB +; +; test that masked-merge code is generated as "xor;and;xor" sequence or +; "andn ; and; or" if and-not is available. + +define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) { +; CHECK-I-LABEL: masked_merge0: +; CHECK-I: # %bb.0: +; CHECK-I-NEXT: xor a1, a1, a2 +; CHECK-I-NEXT: and a0, a1, a0 +; CHECK-I-NEXT: xor a0, a0, a2 +; CHECK-I-NEXT: ret +; +; CHECK-ZBB-LABEL: masked_merge0: +; CHECK-ZBB: # %bb.0: +; CHECK-ZBB-NEXT: and a1, a0, a1 +; CHECK-ZBB-NEXT: andn a0, a2, a0 +; CHECK-ZBB-NEXT: or a0, a1, a0 +; CHECK-ZBB-NEXT: ret + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + ret i32 %or +} + +define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { +; CHECK-I-LABEL: masked_merge1: +; CHECK-I: # %bb.0: +; CHECK-I-NEXT: xor a1, a1, a2 +; CHECK-I-NEXT: and a0, a1, a0 +; CHECK-I-NEXT: xor a0, a0, a2 +; CHECK-I-NEXT: ret +; +; CHECK-ZBB-LABEL: masked_merge1: +; CHECK-ZBB: # %bb.0: +; CHECK-ZBB-NEXT: and a1, a0, a1 +; CHECK-ZBB-NEXT: andn a0, a2, a0 +; CHECK-ZBB-NEXT: or a0, a1, a0 +; CHECK-ZBB-NEXT: ret + %and0 = and i16 %a0, %a1 + %not = xor i16 %a0, -1 + %and1 = and i16 %a2, %not + %or = or i16 %and0, %and1 + 
ret i16 %or +} + +define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) { +; CHECK-I-LABEL: masked_merge2: +; CHECK-I: # %bb.0: +; CHECK-I-NEXT: mv a0, a1 +; CHECK-I-NEXT: ret +; +; CHECK-ZBB-LABEL: masked_merge2: +; CHECK-ZBB: # %bb.0: +; CHECK-ZBB-NEXT: andn a2, a1, a0 +; CHECK-ZBB-NEXT: and a0, a1, a0 +; CHECK-ZBB-NEXT: or a0, a2, a0 +; CHECK-ZBB-NEXT: ret + %not = xor i8 %a0, -1 + %and0 = and i8 %not, %a1 + %and1 = and i8 %a1, %a0 + %or = or i8 %and0, %and1 + ret i8 %or +} + +define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) { +; RV32I-LABEL: masked_merge3: +; RV32I: # %bb.0: +; RV32I-NEXT: not a5, a5 +; RV32I-NEXT: not a4, a4 +; RV32I-NEXT: xor a3, a3, a5 +; RV32I-NEXT: xor a2, a2, a4 +; RV32I-NEXT: not a2, a2 +; RV32I-NEXT: not a3, a3 +; RV32I-NEXT: and a0, a2, a0 +; RV32I-NEXT: and a1, a3, a1 +; RV32I-NEXT: xor a0, a0, a4 +; RV32I-NEXT: xor a1, a1, a5 +; RV32I-NEXT: ret +; +; RV64I-LABEL: masked_merge3: +; RV64I: # %bb.0: +; RV64I-NEXT: not a2, a2 +; RV64I-NEXT: xor a1, a1, a2 +; RV64I-NEXT: not a1, a1 +; RV64I-NEXT: and a0, a1, a0 +; RV64I-NEXT: xor a0, a0, a2 +; RV64I-NEXT: ret +; +; RV32ZBB-LABEL: masked_merge3: +; RV32ZBB: # %bb.0: +; RV32ZBB-NEXT: not a6, a0 +; RV32ZBB-NEXT: not a7, a1 +; RV32ZBB-NEXT: andn a1, a1, a3 +; RV32ZBB-NEXT: andn a0, a0, a2 +; RV32ZBB-NEXT: andn a2, a7, a5 +; RV32ZBB-NEXT: andn a3, a6, a4 +; RV32ZBB-NEXT: or a0, a3, a0 +; RV32ZBB-NEXT: or a1, a2, a1 +; RV32ZBB-NEXT: ret +; +; RV64ZBB-LABEL: masked_merge3: +; RV64ZBB: # %bb.0: +; RV64ZBB-NEXT: not a3, a0 +; RV64ZBB-NEXT: andn a2, a3, a2 +; RV64ZBB-NEXT: andn a0, a0, a1 +; RV64ZBB-NEXT: or a0, a2, a0 +; RV64ZBB-NEXT: ret + %v0 = xor i64 %a1, -1 + %v1 = xor i64 %a2, -1 + %not = xor i64 %a0, -1 + %and0 = and i64 %not, %v1 + %and1 = and i64 %v0, %a0 + %or = or i64 %and0, %and1 + ret i64 %or +} + +define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) { +; RV32-LABEL: not_a_masked_merge0: +; RV32: # %bb.0: +; RV32-NEXT: and a1, a0, a1 +; RV32-NEXT: neg a0, a0 +; RV32-NEXT: and a0, 
a0, a2 +; RV32-NEXT: or a0, a1, a0 +; RV32-NEXT: ret +; +; RV64-LABEL: not_a_masked_merge0: +; RV64: # %bb.0: +; RV64-NEXT: and a1, a0, a1 +; RV64-NEXT: negw a0, a0 +; RV64-NEXT: and a0, a0, a2 +; RV64-NEXT: or a0, a1, a0 +; RV64-NEXT: ret + %and0 = and i32 %a0, %a1 + %not_a_not = sub i32 0, %a0 + %and1 = and i32 %not_a_not, %a2 + %or = or i32 %and0, %and1 + ret i32 %or +} + +define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { +; CHECK-I-LABEL: not_a_masked_merge1: +; CHECK-I: # %bb.0: +; CHECK-I-NEXT: and a0, a0, a1 +; CHECK-I-NEXT: not a1, a3 +; CHECK-I-NEXT: and a1, a1, a2 +; CHECK-I-NEXT: or a0, a0, a1 +; CHECK-I-NEXT: ret +; +; CHECK-ZBB-LABEL: not_a_masked_merge1: +; CHECK-ZBB: # %bb.0: +; CHECK-ZBB-NEXT: and a0, a0, a1 +; CHECK-ZBB-NEXT: andn a1, a2, a3 +; CHECK-ZBB-NEXT: or a0, a0, a1 +; CHECK-ZBB-NEXT: ret + %and0 = and i32 %a0, %a1 + %not = xor i32 %a3, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + ret i32 %or +} + +define i32 @not_a_masked_merge2(i32 %a0, i32 %a1, i32 %a2) { +; CHECK-I-LABEL: not_a_masked_merge2: +; CHECK-I: # %bb.0: +; CHECK-I-NEXT: or a1, a0, a1 +; CHECK-I-NEXT: not a0, a0 +; CHECK-I-NEXT: and a0, a0, a2 +; CHECK-I-NEXT: or a0, a1, a0 +; CHECK-I-NEXT: ret +; +; CHECK-ZBB-LABEL: not_a_masked_merge2: +; CHECK-ZBB: # %bb.0: +; CHECK-ZBB-NEXT: or a1, a0, a1 +; CHECK-ZBB-NEXT: andn a0, a2, a0 +; CHECK-ZBB-NEXT: or a0, a1, a0 +; CHECK-ZBB-NEXT: ret + %not_an_and0 = or i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %not_an_and0, %and1 + ret i32 %or +} + +define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) { +; CHECK-I-LABEL: not_a_masked_merge3: +; CHECK-I: # %bb.0: +; CHECK-I-NEXT: and a1, a0, a1 +; CHECK-I-NEXT: xor a0, a0, a2 +; CHECK-I-NEXT: not a0, a0 +; CHECK-I-NEXT: or a0, a1, a0 +; CHECK-I-NEXT: ret +; +; CHECK-ZBB-LABEL: not_a_masked_merge3: +; CHECK-ZBB: # %bb.0: +; CHECK-ZBB-NEXT: and a1, a0, a1 +; CHECK-ZBB-NEXT: xor a0, a0, a2 +; CHECK-ZBB-NEXT: orn a0, a1, 
a0 +; CHECK-ZBB-NEXT: ret + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %not_an_and1 = xor i32 %not, %a2 + %or = or i32 %and0, %not_an_and1 + ret i32 %or +} + +define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) { +; CHECK-LABEL: not_a_masked_merge4: +; CHECK: # %bb.0: +; CHECK-NEXT: and a0, a0, a1 +; CHECK-NEXT: ret + %and0 = and i32 %a0, %a1 + %not = xor i32 %a2, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + ret i32 %or +} + +define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { +; CHECK-I-LABEL: masked_merge_no_transform0: +; CHECK-I: # %bb.0: +; CHECK-I-NEXT: and a1, a0, a1 +; CHECK-I-NEXT: not a0, a0 +; CHECK-I-NEXT: and a0, a0, a2 +; CHECK-I-NEXT: or a0, a1, a0 +; CHECK-I-NEXT: sw a1, 0(a3) +; CHECK-I-NEXT: ret +; +; CHECK-ZBB-LABEL: masked_merge_no_transform0: +; CHECK-ZBB: # %bb.0: +; CHECK-ZBB-NEXT: and a1, a0, a1 +; CHECK-ZBB-NEXT: andn a0, a2, a0 +; CHECK-ZBB-NEXT: or a0, a1, a0 +; CHECK-ZBB-NEXT: sw a1, 0(a3) +; CHECK-ZBB-NEXT: ret + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + store i32 %and0, ptr %p1 + ret i32 %or +} + +define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { +; CHECK-I-LABEL: masked_merge_no_transform1: +; CHECK-I: # %bb.0: +; CHECK-I-NEXT: and a1, a0, a1 +; CHECK-I-NEXT: not a4, a0 +; CHECK-I-NEXT: and a0, a4, a2 +; CHECK-I-NEXT: or a0, a1, a0 +; CHECK-I-NEXT: sw a4, 0(a3) +; CHECK-I-NEXT: ret +; +; CHECK-ZBB-LABEL: masked_merge_no_transform1: +; CHECK-ZBB: # %bb.0: +; CHECK-ZBB-NEXT: and a1, a0, a1 +; CHECK-ZBB-NEXT: not a4, a0 +; CHECK-ZBB-NEXT: andn a0, a2, a0 +; CHECK-ZBB-NEXT: or a0, a1, a0 +; CHECK-ZBB-NEXT: sw a4, 0(a3) +; CHECK-ZBB-NEXT: ret + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + store i32 %not, ptr %p1 + ret i32 %or +} + +define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { +; CHECK-I-LABEL: 
masked_merge_no_transform2: +; CHECK-I: # %bb.0: +; CHECK-I-NEXT: and a1, a0, a1 +; CHECK-I-NEXT: not a0, a0 +; CHECK-I-NEXT: and a2, a0, a2 +; CHECK-I-NEXT: or a0, a1, a2 +; CHECK-I-NEXT: sw a2, 0(a3) +; CHECK-I-NEXT: ret +; +; CHECK-ZBB-LABEL: masked_merge_no_transform2: +; CHECK-ZBB: # %bb.0: +; CHECK-ZBB-NEXT: and a1, a0, a1 +; CHECK-ZBB-NEXT: andn a2, a2, a0 +; CHECK-ZBB-NEXT: or a0, a1, a2 +; CHECK-ZBB-NEXT: sw a2, 0(a3) +; CHECK-ZBB-NEXT: ret + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + store i32 %and1, ptr %p1 + ret i32 %or +} diff --git a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll index 1517e524a7f78..efc8243df71e0 100644 --- a/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/RISCV/unfold-masked-merge-scalar-variablemask.ll @@ -8,16 +8,13 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zbb < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-ZBB,RV64,RV64ZBB -; TODO: Should we convert these to X ^ ((X ^ Y) & M) form when Zbb isn't -; present? 
define i8 @out8(i8 %x, i8 %y, i8 %mask) { ; CHECK-I-LABEL: out8: ; CHECK-I: # %bb.0: +; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: not a2, a2 -; CHECK-I-NEXT: and a1, a1, a2 -; CHECK-I-NEXT: or a0, a0, a1 +; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out8: @@ -36,10 +33,9 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) { define i16 @out16(i16 %x, i16 %y, i16 %mask) { ; CHECK-I-LABEL: out16: ; CHECK-I: # %bb.0: +; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: not a2, a2 -; CHECK-I-NEXT: and a1, a1, a2 -; CHECK-I-NEXT: or a0, a0, a1 +; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out16: @@ -58,10 +54,9 @@ define i16 @out16(i16 %x, i16 %y, i16 %mask) { define i32 @out32(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out32: ; CHECK-I: # %bb.0: +; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: and a0, a0, a2 -; CHECK-I-NEXT: not a2, a2 -; CHECK-I-NEXT: and a1, a1, a2 -; CHECK-I-NEXT: or a0, a0, a1 +; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out32: @@ -80,22 +75,19 @@ define i32 @out32(i32 %x, i32 %y, i32 %mask) { define i64 @out64(i64 %x, i64 %y, i64 %mask) { ; RV32I-LABEL: out64: ; RV32I: # %bb.0: -; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: xor a0, a0, a2 +; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: and a0, a0, a4 -; RV32I-NEXT: not a4, a4 -; RV32I-NEXT: not a5, a5 -; RV32I-NEXT: and a3, a3, a5 -; RV32I-NEXT: and a2, a2, a4 -; RV32I-NEXT: or a0, a0, a2 -; RV32I-NEXT: or a1, a1, a3 +; RV32I-NEXT: and a1, a1, a5 +; RV32I-NEXT: xor a0, a0, a2 +; RV32I-NEXT: xor a1, a1, a3 ; RV32I-NEXT: ret ; ; RV64I-LABEL: out64: ; RV64I: # %bb.0: +; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: and a0, a0, a2 -; RV64I-NEXT: not a2, a2 -; RV64I-NEXT: and a1, a1, a2 -; RV64I-NEXT: or a0, a0, a1 +; RV64I-NEXT: xor a0, a0, a1 ; RV64I-NEXT: ret ; ; RV32ZBB-LABEL: out64: @@ -660,10 +652,9 @@ define i32 @in_constant_varx_mone_invmask(i32 %x, i32 %y, i32 %mask) { 
define i32 @out_constant_varx_42(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out_constant_varx_42: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: not a1, a2 -; CHECK-I-NEXT: and a0, a2, a0 -; CHECK-I-NEXT: andi a1, a1, 42 -; CHECK-I-NEXT: or a0, a0, a1 +; CHECK-I-NEXT: xori a0, a0, 42 +; CHECK-I-NEXT: and a0, a0, a2 +; CHECK-I-NEXT: xori a0, a0, 42 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out_constant_varx_42: @@ -704,10 +695,9 @@ define i32 @in_constant_varx_42(i32 %x, i32 %y, i32 %mask) { define i32 @out_constant_varx_42_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out_constant_varx_42_invmask: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: not a1, a2 -; CHECK-I-NEXT: and a0, a1, a0 -; CHECK-I-NEXT: andi a1, a2, 42 -; CHECK-I-NEXT: or a0, a0, a1 +; CHECK-I-NEXT: xori a1, a0, 42 +; CHECK-I-NEXT: and a1, a1, a2 +; CHECK-I-NEXT: xor a0, a1, a0 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out_constant_varx_42_invmask: @@ -812,10 +802,9 @@ define i32 @in_constant_mone_vary_invmask(i32 %x, i32 %y, i32 %mask) { define i32 @out_constant_42_vary(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out_constant_42_vary: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: not a0, a2 -; CHECK-I-NEXT: andi a2, a2, 42 -; CHECK-I-NEXT: and a0, a0, a1 -; CHECK-I-NEXT: or a0, a2, a0 +; CHECK-I-NEXT: xori a0, a1, 42 +; CHECK-I-NEXT: and a0, a0, a2 +; CHECK-I-NEXT: xor a0, a0, a1 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out_constant_42_vary: @@ -855,10 +844,9 @@ define i32 @in_constant_42_vary(i32 %x, i32 %y, i32 %mask) { define i32 @out_constant_42_vary_invmask(i32 %x, i32 %y, i32 %mask) { ; CHECK-I-LABEL: out_constant_42_vary_invmask: ; CHECK-I: # %bb.0: -; CHECK-I-NEXT: not a0, a2 -; CHECK-I-NEXT: andi a0, a0, 42 -; CHECK-I-NEXT: and a1, a2, a1 -; CHECK-I-NEXT: or a0, a0, a1 +; CHECK-I-NEXT: xori a0, a1, 42 +; CHECK-I-NEXT: and a0, a0, a2 +; CHECK-I-NEXT: xori a0, a0, 42 ; CHECK-I-NEXT: ret ; ; CHECK-ZBB-LABEL: out_constant_42_vary_invmask: diff --git a/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll 
b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll new file mode 100644 index 0000000000000..c014345507f69 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/fold-masked-merge.ll @@ -0,0 +1,277 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 | FileCheck %s --check-prefix=NO-MISC3 +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z15 | FileCheck %s --check-prefix=MISC3 + +; test that masked-merge code is generated as "xor;and;xor" sequence or +; "andn ; and; or" if and-not is available. + +define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) { +; NO-MISC3-LABEL: masked_merge0: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: xr %r3, %r4 +; NO-MISC3-NEXT: nr %r2, %r3 +; NO-MISC3-NEXT: xr %r2, %r4 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: masked_merge0: +; MISC3: # %bb.0: +; MISC3-NEXT: nr %r3, %r2 +; MISC3-NEXT: ncrk %r2, %r4, %r2 +; MISC3-NEXT: or %r2, %r3 +; MISC3-NEXT: br %r14 + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + ret i32 %or +} + +define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { +; NO-MISC3-LABEL: masked_merge1: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: xr %r3, %r4 +; NO-MISC3-NEXT: nr %r2, %r3 +; NO-MISC3-NEXT: xr %r2, %r4 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: masked_merge1: +; MISC3: # %bb.0: +; MISC3-NEXT: ncrk %r0, %r4, %r2 +; MISC3-NEXT: nr %r2, %r3 +; MISC3-NEXT: or %r2, %r0 +; MISC3-NEXT: br %r14 + %and0 = and i16 %a0, %a1 + %not = xor i16 %a0, -1 + %and1 = and i16 %a2, %not + %or = or i16 %and0, %and1 + ret i16 %or +} + +define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) { +; NO-MISC3-LABEL: masked_merge2: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: lr %r2, %r3 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: masked_merge2: +; MISC3: # %bb.0: +; MISC3-NEXT: lr %r2, %r3 +; MISC3-NEXT: br %r14 + %not = xor i8 %a0, -1 + %and0 = and i8 %not, %a1 + %and1 = and i8 %a1, %a0 + %or = or i8 %and0, 
%and1 + ret i8 %or +} + +define i64 @masked_merge3(i64 %a0, i64 %a1, i64 %a2) { +; NO-MISC3-LABEL: masked_merge3: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: lcgr %r0, %r4 +; NO-MISC3-NEXT: aghi %r0, -1 +; NO-MISC3-NEXT: xgr %r3, %r0 +; NO-MISC3-NEXT: ngr %r3, %r2 +; NO-MISC3-NEXT: xgr %r3, %r2 +; NO-MISC3-NEXT: xgrk %r2, %r3, %r0 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: masked_merge3: +; MISC3: # %bb.0: +; MISC3-NEXT: lcgr %r0, %r2 +; MISC3-NEXT: aghi %r0, -1 +; MISC3-NEXT: ncgrk %r0, %r0, %r4 +; MISC3-NEXT: ncgrk %r2, %r2, %r3 +; MISC3-NEXT: ogr %r2, %r0 +; MISC3-NEXT: br %r14 + %v0 = xor i64 %a1, -1 + %v1 = xor i64 %a2, -1 + %not = xor i64 %a0, -1 + %and0 = and i64 %not, %v1 + %and1 = and i64 %v0, %a0 + %or = or i64 %and0, %and1 + ret i64 %or +} + +define i32 @not_a_masked_merge0(i32 %a0, i32 %a1, i32 %a2) { +; NO-MISC3-LABEL: not_a_masked_merge0: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: lcr %r0, %r2 +; NO-MISC3-NEXT: nr %r3, %r2 +; NO-MISC3-NEXT: nr %r0, %r4 +; NO-MISC3-NEXT: ork %r2, %r3, %r0 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: not_a_masked_merge0: +; MISC3: # %bb.0: +; MISC3-NEXT: lcr %r0, %r2 +; MISC3-NEXT: nr %r3, %r2 +; MISC3-NEXT: nr %r0, %r4 +; MISC3-NEXT: ork %r2, %r3, %r0 +; MISC3-NEXT: br %r14 + %and0 = and i32 %a0, %a1 + %not_a_not = sub i32 0, %a0 + %and1 = and i32 %not_a_not, %a2 + %or = or i32 %and0, %and1 + ret i32 %or +} + +define i32 @not_a_masked_merge1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) { +; NO-MISC3-LABEL: not_a_masked_merge1: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: xilf %r5, 4294967295 +; NO-MISC3-NEXT: nr %r2, %r3 +; NO-MISC3-NEXT: nr %r4, %r5 +; NO-MISC3-NEXT: or %r2, %r4 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: not_a_masked_merge1: +; MISC3: # %bb.0: +; MISC3-NEXT: nr %r2, %r3 +; MISC3-NEXT: ncrk %r0, %r4, %r5 +; MISC3-NEXT: or %r2, %r0 +; MISC3-NEXT: br %r14 + %and0 = and i32 %a0, %a1 + %not = xor i32 %a3, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + ret i32 %or +} + +define i32 @not_a_masked_merge2(i32 %a0, 
i32 %a1, i32 %a2) { +; NO-MISC3-LABEL: not_a_masked_merge2: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: or %r3, %r2 +; NO-MISC3-NEXT: xilf %r2, 4294967295 +; NO-MISC3-NEXT: nr %r2, %r4 +; NO-MISC3-NEXT: or %r2, %r3 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: not_a_masked_merge2: +; MISC3: # %bb.0: +; MISC3-NEXT: or %r3, %r2 +; MISC3-NEXT: ncrk %r2, %r4, %r2 +; MISC3-NEXT: or %r2, %r3 +; MISC3-NEXT: br %r14 + %not_an_and0 = or i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %not_an_and0, %and1 + ret i32 %or +} + +define i32 @not_a_masked_merge3(i32 %a0, i32 %a1, i32 %a2) { +; NO-MISC3-LABEL: not_a_masked_merge3: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: nr %r3, %r2 +; NO-MISC3-NEXT: xr %r2, %r4 +; NO-MISC3-NEXT: xilf %r2, 4294967295 +; NO-MISC3-NEXT: or %r2, %r3 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: not_a_masked_merge3: +; MISC3: # %bb.0: +; MISC3-NEXT: nr %r3, %r2 +; MISC3-NEXT: xr %r2, %r4 +; MISC3-NEXT: ocrk %r2, %r3, %r2 +; MISC3-NEXT: br %r14 + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %not_an_and1 = xor i32 %not, %a2 + %or = or i32 %and0, %not_an_and1 + ret i32 %or +} + +define i32 @not_a_masked_merge4(i32 %a0, i32 %a1, i32 %a2) { +; NO-MISC3-LABEL: not_a_masked_merge4: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: nr %r2, %r3 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: not_a_masked_merge4: +; MISC3: # %bb.0: +; MISC3-NEXT: nr %r2, %r3 +; MISC3-NEXT: br %r14 + %and0 = and i32 %a0, %a1 + %not = xor i32 %a2, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + ret i32 %or +} + +define i32 @masked_merge_no_transform0(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { +; NO-MISC3-LABEL: masked_merge_no_transform0: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: nr %r3, %r2 +; NO-MISC3-NEXT: xilf %r2, 4294967295 +; NO-MISC3-NEXT: nr %r2, %r4 +; NO-MISC3-NEXT: or %r2, %r3 +; NO-MISC3-NEXT: st %r3, 0(%r5) +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: masked_merge_no_transform0: +; MISC3: # %bb.0: +; MISC3-NEXT: nr %r3, %r2 +; MISC3-NEXT: 
ncrk %r2, %r4, %r2 +; MISC3-NEXT: or %r2, %r3 +; MISC3-NEXT: st %r3, 0(%r5) +; MISC3-NEXT: br %r14 + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + store i32 %and0, ptr %p1 + ret i32 %or +} + +define i32 @masked_merge_no_transform1(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { +; NO-MISC3-LABEL: masked_merge_no_transform1: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: nrk %r0, %r2, %r3 +; NO-MISC3-NEXT: xilf %r2, 4294967295 +; NO-MISC3-NEXT: nr %r4, %r2 +; NO-MISC3-NEXT: or %r0, %r4 +; NO-MISC3-NEXT: st %r2, 0(%r5) +; NO-MISC3-NEXT: lr %r2, %r0 +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: masked_merge_no_transform1: +; MISC3: # %bb.0: +; MISC3-NEXT: nrk %r0, %r2, %r3 +; MISC3-NEXT: ncrk %r1, %r4, %r2 +; MISC3-NEXT: xilf %r2, 4294967295 +; MISC3-NEXT: or %r0, %r1 +; MISC3-NEXT: st %r2, 0(%r5) +; MISC3-NEXT: lr %r2, %r0 +; MISC3-NEXT: br %r14 + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + store i32 %not, ptr %p1 + ret i32 %or +} + +define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { +; NO-MISC3-LABEL: masked_merge_no_transform2: +; NO-MISC3: # %bb.0: +; NO-MISC3-NEXT: nr %r3, %r2 +; NO-MISC3-NEXT: xilf %r2, 4294967295 +; NO-MISC3-NEXT: nr %r4, %r2 +; NO-MISC3-NEXT: ork %r2, %r3, %r4 +; NO-MISC3-NEXT: st %r4, 0(%r5) +; NO-MISC3-NEXT: br %r14 +; +; MISC3-LABEL: masked_merge_no_transform2: +; MISC3: # %bb.0: +; MISC3-NEXT: nr %r3, %r2 +; MISC3-NEXT: ncrk %r0, %r4, %r2 +; MISC3-NEXT: ork %r2, %r3, %r0 +; MISC3-NEXT: st %r0, 0(%r5) +; MISC3-NEXT: br %r14 + %and0 = and i32 %a0, %a1 + %not = xor i32 %a0, -1 + %and1 = and i32 %not, %a2 + %or = or i32 %and0, %and1 + store i32 %and1, ptr %p1 + ret i32 %or +} diff --git a/llvm/test/CodeGen/WebAssembly/simd-arith.ll b/llvm/test/CodeGen/WebAssembly/simd-arith.ll index 185c46aa5681e..e3607e12bf530 100644 --- a/llvm/test/CodeGen/WebAssembly/simd-arith.ll +++ 
b/llvm/test/CodeGen/WebAssembly/simd-arith.ll @@ -4465,203 +4465,139 @@ define <16 x i8> @bitselect_v16i8(<16 x i8> %c, <16 x i8> %v1, <16 x i8> %v2) { ; NO-SIMD128-LABEL: bitselect_v16i8: ; NO-SIMD128: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.and $push0=, $16, $32 -; NO-SIMD128-NEXT: i32.const $push1=, -1 -; NO-SIMD128-NEXT: i32.xor $push2=, $16, $pop1 -; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $48 -; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3 -; NO-SIMD128-NEXT: i32.store8 15($0), $pop4 -; NO-SIMD128-NEXT: i32.and $push5=, $15, $31 -; NO-SIMD128-NEXT: i32.const $push79=, -1 -; NO-SIMD128-NEXT: i32.xor $push6=, $15, $pop79 -; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $47 -; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7 -; NO-SIMD128-NEXT: i32.store8 14($0), $pop8 -; NO-SIMD128-NEXT: i32.and $push9=, $14, $30 -; NO-SIMD128-NEXT: i32.const $push78=, -1 -; NO-SIMD128-NEXT: i32.xor $push10=, $14, $pop78 -; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $46 -; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11 -; NO-SIMD128-NEXT: i32.store8 13($0), $pop12 -; NO-SIMD128-NEXT: i32.and $push13=, $13, $29 -; NO-SIMD128-NEXT: i32.const $push77=, -1 -; NO-SIMD128-NEXT: i32.xor $push14=, $13, $pop77 -; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $45 -; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15 -; NO-SIMD128-NEXT: i32.store8 12($0), $pop16 -; NO-SIMD128-NEXT: i32.and $push17=, $12, $28 -; NO-SIMD128-NEXT: i32.const $push76=, -1 -; NO-SIMD128-NEXT: i32.xor $push18=, $12, $pop76 -; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $44 -; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19 -; NO-SIMD128-NEXT: i32.store8 11($0), $pop20 -; NO-SIMD128-NEXT: i32.and $push21=, $11, $27 -; NO-SIMD128-NEXT: 
i32.const $push75=, -1 -; NO-SIMD128-NEXT: i32.xor $push22=, $11, $pop75 -; NO-SIMD128-NEXT: i32.and $push23=, $pop22, $43 -; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23 -; NO-SIMD128-NEXT: i32.store8 10($0), $pop24 -; NO-SIMD128-NEXT: i32.and $push25=, $10, $26 -; NO-SIMD128-NEXT: i32.const $push74=, -1 -; NO-SIMD128-NEXT: i32.xor $push26=, $10, $pop74 -; NO-SIMD128-NEXT: i32.and $push27=, $pop26, $42 -; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27 -; NO-SIMD128-NEXT: i32.store8 9($0), $pop28 -; NO-SIMD128-NEXT: i32.and $push29=, $9, $25 -; NO-SIMD128-NEXT: i32.const $push73=, -1 -; NO-SIMD128-NEXT: i32.xor $push30=, $9, $pop73 -; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $41 -; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31 -; NO-SIMD128-NEXT: i32.store8 8($0), $pop32 -; NO-SIMD128-NEXT: i32.and $push33=, $8, $24 -; NO-SIMD128-NEXT: i32.const $push72=, -1 -; NO-SIMD128-NEXT: i32.xor $push34=, $8, $pop72 -; NO-SIMD128-NEXT: i32.and $push35=, $pop34, $40 -; NO-SIMD128-NEXT: i32.or $push36=, $pop33, $pop35 -; NO-SIMD128-NEXT: i32.store8 7($0), $pop36 -; NO-SIMD128-NEXT: i32.and $push37=, $7, $23 -; NO-SIMD128-NEXT: i32.const $push71=, -1 -; NO-SIMD128-NEXT: i32.xor $push38=, $7, $pop71 -; NO-SIMD128-NEXT: i32.and $push39=, $pop38, $39 -; NO-SIMD128-NEXT: i32.or $push40=, $pop37, $pop39 -; NO-SIMD128-NEXT: i32.store8 6($0), $pop40 -; NO-SIMD128-NEXT: i32.and $push41=, $6, $22 -; NO-SIMD128-NEXT: i32.const $push70=, -1 -; NO-SIMD128-NEXT: i32.xor $push42=, $6, $pop70 -; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $38 -; NO-SIMD128-NEXT: i32.or $push44=, $pop41, $pop43 -; NO-SIMD128-NEXT: i32.store8 5($0), $pop44 -; NO-SIMD128-NEXT: i32.and $push45=, $5, $21 -; NO-SIMD128-NEXT: i32.const $push69=, -1 -; NO-SIMD128-NEXT: i32.xor $push46=, $5, $pop69 -; NO-SIMD128-NEXT: i32.and $push47=, $pop46, $37 -; NO-SIMD128-NEXT: i32.or $push48=, $pop45, $pop47 -; NO-SIMD128-NEXT: i32.store8 4($0), $pop48 -; NO-SIMD128-NEXT: i32.and $push49=, $4, $20 -; NO-SIMD128-NEXT: 
i32.const $push68=, -1 -; NO-SIMD128-NEXT: i32.xor $push50=, $4, $pop68 -; NO-SIMD128-NEXT: i32.and $push51=, $pop50, $36 -; NO-SIMD128-NEXT: i32.or $push52=, $pop49, $pop51 -; NO-SIMD128-NEXT: i32.store8 3($0), $pop52 -; NO-SIMD128-NEXT: i32.and $push53=, $3, $19 -; NO-SIMD128-NEXT: i32.const $push67=, -1 -; NO-SIMD128-NEXT: i32.xor $push54=, $3, $pop67 -; NO-SIMD128-NEXT: i32.and $push55=, $pop54, $35 -; NO-SIMD128-NEXT: i32.or $push56=, $pop53, $pop55 -; NO-SIMD128-NEXT: i32.store8 2($0), $pop56 -; NO-SIMD128-NEXT: i32.and $push57=, $2, $18 -; NO-SIMD128-NEXT: i32.const $push66=, -1 -; NO-SIMD128-NEXT: i32.xor $push58=, $2, $pop66 -; NO-SIMD128-NEXT: i32.and $push59=, $pop58, $34 -; NO-SIMD128-NEXT: i32.or $push60=, $pop57, $pop59 -; NO-SIMD128-NEXT: i32.store8 1($0), $pop60 -; NO-SIMD128-NEXT: i32.and $push61=, $1, $17 -; NO-SIMD128-NEXT: i32.const $push65=, -1 -; NO-SIMD128-NEXT: i32.xor $push62=, $1, $pop65 -; NO-SIMD128-NEXT: i32.and $push63=, $pop62, $33 -; NO-SIMD128-NEXT: i32.or $push64=, $pop61, $pop63 -; NO-SIMD128-NEXT: i32.store8 0($0), $pop64 +; NO-SIMD128-NEXT: i32.xor $push0=, $32, $48 +; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $16 +; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $48 +; NO-SIMD128-NEXT: i32.store8 15($0), $pop2 +; NO-SIMD128-NEXT: i32.xor $push3=, $31, $47 +; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $15 +; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $47 +; NO-SIMD128-NEXT: i32.store8 14($0), $pop5 +; NO-SIMD128-NEXT: i32.xor $push6=, $30, $46 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $14 +; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $46 +; NO-SIMD128-NEXT: i32.store8 13($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push9=, $29, $45 +; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $13 +; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $45 +; NO-SIMD128-NEXT: i32.store8 12($0), $pop11 +; NO-SIMD128-NEXT: i32.xor $push12=, $28, $44 +; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $12 +; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $44 +; NO-SIMD128-NEXT: 
i32.store8 11($0), $pop14 +; NO-SIMD128-NEXT: i32.xor $push15=, $27, $43 +; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $11 +; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $43 +; NO-SIMD128-NEXT: i32.store8 10($0), $pop17 +; NO-SIMD128-NEXT: i32.xor $push18=, $26, $42 +; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $10 +; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $42 +; NO-SIMD128-NEXT: i32.store8 9($0), $pop20 +; NO-SIMD128-NEXT: i32.xor $push21=, $25, $41 +; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $9 +; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $41 +; NO-SIMD128-NEXT: i32.store8 8($0), $pop23 +; NO-SIMD128-NEXT: i32.xor $push24=, $24, $40 +; NO-SIMD128-NEXT: i32.and $push25=, $pop24, $8 +; NO-SIMD128-NEXT: i32.xor $push26=, $pop25, $40 +; NO-SIMD128-NEXT: i32.store8 7($0), $pop26 +; NO-SIMD128-NEXT: i32.xor $push27=, $23, $39 +; NO-SIMD128-NEXT: i32.and $push28=, $pop27, $7 +; NO-SIMD128-NEXT: i32.xor $push29=, $pop28, $39 +; NO-SIMD128-NEXT: i32.store8 6($0), $pop29 +; NO-SIMD128-NEXT: i32.xor $push30=, $22, $38 +; NO-SIMD128-NEXT: i32.and $push31=, $pop30, $6 +; NO-SIMD128-NEXT: i32.xor $push32=, $pop31, $38 +; NO-SIMD128-NEXT: i32.store8 5($0), $pop32 +; NO-SIMD128-NEXT: i32.xor $push33=, $21, $37 +; NO-SIMD128-NEXT: i32.and $push34=, $pop33, $5 +; NO-SIMD128-NEXT: i32.xor $push35=, $pop34, $37 +; NO-SIMD128-NEXT: i32.store8 4($0), $pop35 +; NO-SIMD128-NEXT: i32.xor $push36=, $20, $36 +; NO-SIMD128-NEXT: i32.and $push37=, $pop36, $4 +; NO-SIMD128-NEXT: i32.xor $push38=, $pop37, $36 +; NO-SIMD128-NEXT: i32.store8 3($0), $pop38 +; NO-SIMD128-NEXT: i32.xor $push39=, $19, $35 +; NO-SIMD128-NEXT: i32.and $push40=, $pop39, $3 +; NO-SIMD128-NEXT: i32.xor $push41=, $pop40, $35 +; NO-SIMD128-NEXT: i32.store8 2($0), $pop41 +; NO-SIMD128-NEXT: i32.xor $push42=, $18, $34 +; NO-SIMD128-NEXT: i32.and $push43=, $pop42, $2 +; NO-SIMD128-NEXT: i32.xor $push44=, $pop43, $34 +; NO-SIMD128-NEXT: i32.store8 1($0), $pop44 +; NO-SIMD128-NEXT: i32.xor $push45=, $17, $33 +; 
NO-SIMD128-NEXT: i32.and $push46=, $pop45, $1 +; NO-SIMD128-NEXT: i32.xor $push47=, $pop46, $33 +; NO-SIMD128-NEXT: i32.store8 0($0), $pop47 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v16i8: ; NO-SIMD128-FAST: .functype bitselect_v16i8 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $17 -; NO-SIMD128-FAST-NEXT: i32.const $push1=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop1 -; NO-SIMD128-FAST-NEXT: i32.and $push3=, $pop2, $33 -; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $18 -; NO-SIMD128-FAST-NEXT: i32.const $push79=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop79 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $34 -; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $19 -; NO-SIMD128-FAST-NEXT: i32.const $push78=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop78 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $35 -; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11 -; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $20 -; NO-SIMD128-FAST-NEXT: i32.const $push77=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop77 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $36 -; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.and $push17=, $5, $21 -; NO-SIMD128-FAST-NEXT: i32.const $push76=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop76 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $37 -; 
NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop20 -; NO-SIMD128-FAST-NEXT: i32.and $push21=, $6, $22 -; NO-SIMD128-FAST-NEXT: i32.const $push75=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop75 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $pop22, $38 -; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop24 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $7, $23 -; NO-SIMD128-FAST-NEXT: i32.const $push74=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop74 -; NO-SIMD128-FAST-NEXT: i32.and $push27=, $pop26, $39 -; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27 -; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop28 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $8, $24 -; NO-SIMD128-FAST-NEXT: i32.const $push73=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop73 -; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $40 -; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31 -; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop32 -; NO-SIMD128-FAST-NEXT: i32.and $push33=, $9, $25 -; NO-SIMD128-FAST-NEXT: i32.const $push72=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push34=, $9, $pop72 -; NO-SIMD128-FAST-NEXT: i32.and $push35=, $pop34, $41 -; NO-SIMD128-FAST-NEXT: i32.or $push36=, $pop33, $pop35 -; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop36 -; NO-SIMD128-FAST-NEXT: i32.and $push37=, $10, $26 -; NO-SIMD128-FAST-NEXT: i32.const $push71=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $10, $pop71 -; NO-SIMD128-FAST-NEXT: i32.and $push39=, $pop38, $42 -; NO-SIMD128-FAST-NEXT: i32.or $push40=, $pop37, $pop39 -; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop40 -; NO-SIMD128-FAST-NEXT: i32.and $push41=, $11, $27 -; NO-SIMD128-FAST-NEXT: i32.const $push70=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $11, $pop70 -; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $43 -; NO-SIMD128-FAST-NEXT: i32.or $push44=, $pop41, $pop43 -; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop44 
-; NO-SIMD128-FAST-NEXT: i32.and $push45=, $12, $28 -; NO-SIMD128-FAST-NEXT: i32.const $push69=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push46=, $12, $pop69 -; NO-SIMD128-FAST-NEXT: i32.and $push47=, $pop46, $44 -; NO-SIMD128-FAST-NEXT: i32.or $push48=, $pop45, $pop47 -; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop48 -; NO-SIMD128-FAST-NEXT: i32.and $push49=, $13, $29 -; NO-SIMD128-FAST-NEXT: i32.const $push68=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push50=, $13, $pop68 -; NO-SIMD128-FAST-NEXT: i32.and $push51=, $pop50, $45 -; NO-SIMD128-FAST-NEXT: i32.or $push52=, $pop49, $pop51 -; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop52 -; NO-SIMD128-FAST-NEXT: i32.and $push53=, $14, $30 -; NO-SIMD128-FAST-NEXT: i32.const $push67=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push54=, $14, $pop67 -; NO-SIMD128-FAST-NEXT: i32.and $push55=, $pop54, $46 -; NO-SIMD128-FAST-NEXT: i32.or $push56=, $pop53, $pop55 -; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop56 -; NO-SIMD128-FAST-NEXT: i32.and $push57=, $15, $31 -; NO-SIMD128-FAST-NEXT: i32.const $push66=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push58=, $15, $pop66 -; NO-SIMD128-FAST-NEXT: i32.and $push59=, $pop58, $47 -; NO-SIMD128-FAST-NEXT: i32.or $push60=, $pop57, $pop59 -; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop60 -; NO-SIMD128-FAST-NEXT: i32.and $push61=, $16, $32 -; NO-SIMD128-FAST-NEXT: i32.const $push65=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push62=, $16, $pop65 -; NO-SIMD128-FAST-NEXT: i32.and $push63=, $pop62, $48 -; NO-SIMD128-FAST-NEXT: i32.or $push64=, $pop61, $pop63 -; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop64 +; NO-SIMD128-FAST-NEXT: i32.xor $push0=, $17, $33 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $pop0, $1 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $pop1, $33 +; NO-SIMD128-FAST-NEXT: i32.store8 0($0), $pop2 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $18, $34 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop3, $2 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $pop4, $34 +; NO-SIMD128-FAST-NEXT: i32.store8 1($0), $pop5 
+; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $19, $35 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3 +; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $35 +; NO-SIMD128-FAST-NEXT: i32.store8 2($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $20, $36 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $36 +; NO-SIMD128-FAST-NEXT: i32.store8 3($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $21, $37 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $37 +; NO-SIMD128-FAST-NEXT: i32.store8 4($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $22, $38 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6 +; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $38 +; NO-SIMD128-FAST-NEXT: i32.store8 5($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $23, $39 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7 +; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $39 +; NO-SIMD128-FAST-NEXT: i32.store8 6($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $24, $40 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8 +; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $40 +; NO-SIMD128-FAST-NEXT: i32.store8 7($0), $pop23 +; NO-SIMD128-FAST-NEXT: i32.xor $push24=, $25, $41 +; NO-SIMD128-FAST-NEXT: i32.and $push25=, $pop24, $9 +; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $pop25, $41 +; NO-SIMD128-FAST-NEXT: i32.store8 8($0), $pop26 +; NO-SIMD128-FAST-NEXT: i32.xor $push27=, $26, $42 +; NO-SIMD128-FAST-NEXT: i32.and $push28=, $pop27, $10 +; NO-SIMD128-FAST-NEXT: i32.xor $push29=, $pop28, $42 +; NO-SIMD128-FAST-NEXT: i32.store8 9($0), $pop29 +; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $27, $43 +; NO-SIMD128-FAST-NEXT: i32.and $push31=, $pop30, $11 +; NO-SIMD128-FAST-NEXT: i32.xor $push32=, $pop31, $43 +; NO-SIMD128-FAST-NEXT: i32.store8 10($0), $pop32 +; NO-SIMD128-FAST-NEXT: i32.xor $push33=, $28, $44 +; NO-SIMD128-FAST-NEXT: i32.and $push34=, $pop33, 
$12 +; NO-SIMD128-FAST-NEXT: i32.xor $push35=, $pop34, $44 +; NO-SIMD128-FAST-NEXT: i32.store8 11($0), $pop35 +; NO-SIMD128-FAST-NEXT: i32.xor $push36=, $29, $45 +; NO-SIMD128-FAST-NEXT: i32.and $push37=, $pop36, $13 +; NO-SIMD128-FAST-NEXT: i32.xor $push38=, $pop37, $45 +; NO-SIMD128-FAST-NEXT: i32.store8 12($0), $pop38 +; NO-SIMD128-FAST-NEXT: i32.xor $push39=, $30, $46 +; NO-SIMD128-FAST-NEXT: i32.and $push40=, $pop39, $14 +; NO-SIMD128-FAST-NEXT: i32.xor $push41=, $pop40, $46 +; NO-SIMD128-FAST-NEXT: i32.store8 13($0), $pop41 +; NO-SIMD128-FAST-NEXT: i32.xor $push42=, $31, $47 +; NO-SIMD128-FAST-NEXT: i32.and $push43=, $pop42, $15 +; NO-SIMD128-FAST-NEXT: i32.xor $push44=, $pop43, $47 +; NO-SIMD128-FAST-NEXT: i32.store8 14($0), $pop44 +; NO-SIMD128-FAST-NEXT: i32.xor $push45=, $32, $48 +; NO-SIMD128-FAST-NEXT: i32.and $push46=, $pop45, $16 +; NO-SIMD128-FAST-NEXT: i32.xor $push47=, $pop46, $48 +; NO-SIMD128-FAST-NEXT: i32.store8 15($0), $pop47 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <16 x i8> %c, %v1 %inv_mask = xor <16 x i8> %c, @@ -7546,107 +7482,75 @@ define <8 x i16> @bitselect_v8i16(<8 x i16> %c, <8 x i16> %v1, <8 x i16> %v2) { ; NO-SIMD128-LABEL: bitselect_v8i16: ; NO-SIMD128: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.and $push0=, $16, $8 -; NO-SIMD128-NEXT: i32.const $push1=, -1 -; NO-SIMD128-NEXT: i32.xor $push2=, $8, $pop1 -; NO-SIMD128-NEXT: i32.and $push3=, $24, $pop2 -; NO-SIMD128-NEXT: i32.or $push4=, $pop0, $pop3 -; NO-SIMD128-NEXT: i32.store16 14($0), $pop4 -; NO-SIMD128-NEXT: i32.and $push5=, $15, $7 -; NO-SIMD128-NEXT: i32.const $push39=, -1 -; NO-SIMD128-NEXT: i32.xor $push6=, $7, $pop39 -; NO-SIMD128-NEXT: i32.and $push7=, $23, $pop6 -; NO-SIMD128-NEXT: i32.or $push8=, $pop5, $pop7 -; NO-SIMD128-NEXT: i32.store16 12($0), $pop8 -; NO-SIMD128-NEXT: i32.and $push9=, $14, 
$6 -; NO-SIMD128-NEXT: i32.const $push38=, -1 -; NO-SIMD128-NEXT: i32.xor $push10=, $6, $pop38 -; NO-SIMD128-NEXT: i32.and $push11=, $22, $pop10 -; NO-SIMD128-NEXT: i32.or $push12=, $pop9, $pop11 -; NO-SIMD128-NEXT: i32.store16 10($0), $pop12 -; NO-SIMD128-NEXT: i32.and $push13=, $13, $5 -; NO-SIMD128-NEXT: i32.const $push37=, -1 -; NO-SIMD128-NEXT: i32.xor $push14=, $5, $pop37 -; NO-SIMD128-NEXT: i32.and $push15=, $21, $pop14 -; NO-SIMD128-NEXT: i32.or $push16=, $pop13, $pop15 -; NO-SIMD128-NEXT: i32.store16 8($0), $pop16 -; NO-SIMD128-NEXT: i32.and $push17=, $12, $4 -; NO-SIMD128-NEXT: i32.const $push36=, -1 -; NO-SIMD128-NEXT: i32.xor $push18=, $4, $pop36 -; NO-SIMD128-NEXT: i32.and $push19=, $20, $pop18 -; NO-SIMD128-NEXT: i32.or $push20=, $pop17, $pop19 -; NO-SIMD128-NEXT: i32.store16 6($0), $pop20 -; NO-SIMD128-NEXT: i32.and $push21=, $11, $3 -; NO-SIMD128-NEXT: i32.const $push35=, -1 -; NO-SIMD128-NEXT: i32.xor $push22=, $3, $pop35 -; NO-SIMD128-NEXT: i32.and $push23=, $19, $pop22 -; NO-SIMD128-NEXT: i32.or $push24=, $pop21, $pop23 -; NO-SIMD128-NEXT: i32.store16 4($0), $pop24 -; NO-SIMD128-NEXT: i32.and $push25=, $10, $2 -; NO-SIMD128-NEXT: i32.const $push34=, -1 -; NO-SIMD128-NEXT: i32.xor $push26=, $2, $pop34 -; NO-SIMD128-NEXT: i32.and $push27=, $18, $pop26 -; NO-SIMD128-NEXT: i32.or $push28=, $pop25, $pop27 -; NO-SIMD128-NEXT: i32.store16 2($0), $pop28 -; NO-SIMD128-NEXT: i32.and $push29=, $9, $1 -; NO-SIMD128-NEXT: i32.const $push33=, -1 -; NO-SIMD128-NEXT: i32.xor $push30=, $1, $pop33 -; NO-SIMD128-NEXT: i32.and $push31=, $17, $pop30 -; NO-SIMD128-NEXT: i32.or $push32=, $pop29, $pop31 -; NO-SIMD128-NEXT: i32.store16 0($0), $pop32 +; NO-SIMD128-NEXT: i32.xor $push0=, $16, $24 +; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $8 +; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $24 +; NO-SIMD128-NEXT: i32.store16 14($0), $pop2 +; NO-SIMD128-NEXT: i32.xor $push3=, $15, $23 +; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $7 +; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $23 
+; NO-SIMD128-NEXT: i32.store16 12($0), $pop5 +; NO-SIMD128-NEXT: i32.xor $push6=, $14, $22 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $6 +; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $22 +; NO-SIMD128-NEXT: i32.store16 10($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push9=, $13, $21 +; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $5 +; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $21 +; NO-SIMD128-NEXT: i32.store16 8($0), $pop11 +; NO-SIMD128-NEXT: i32.xor $push12=, $12, $20 +; NO-SIMD128-NEXT: i32.and $push13=, $pop12, $4 +; NO-SIMD128-NEXT: i32.xor $push14=, $pop13, $20 +; NO-SIMD128-NEXT: i32.store16 6($0), $pop14 +; NO-SIMD128-NEXT: i32.xor $push15=, $11, $19 +; NO-SIMD128-NEXT: i32.and $push16=, $pop15, $3 +; NO-SIMD128-NEXT: i32.xor $push17=, $pop16, $19 +; NO-SIMD128-NEXT: i32.store16 4($0), $pop17 +; NO-SIMD128-NEXT: i32.xor $push18=, $10, $18 +; NO-SIMD128-NEXT: i32.and $push19=, $pop18, $2 +; NO-SIMD128-NEXT: i32.xor $push20=, $pop19, $18 +; NO-SIMD128-NEXT: i32.store16 2($0), $pop20 +; NO-SIMD128-NEXT: i32.xor $push21=, $9, $17 +; NO-SIMD128-NEXT: i32.and $push22=, $pop21, $1 +; NO-SIMD128-NEXT: i32.xor $push23=, $pop22, $17 +; NO-SIMD128-NEXT: i32.store16 0($0), $pop23 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v8i16: ; NO-SIMD128-FAST: .functype bitselect_v8i16 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: i32.and $push0=, $9, $1 -; NO-SIMD128-FAST-NEXT: i32.const $push1=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop1 -; NO-SIMD128-FAST-NEXT: i32.and $push3=, $17, $pop2 -; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop0, $pop3 -; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $10, $2 -; NO-SIMD128-FAST-NEXT: i32.const $push39=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop39 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $18, $pop6 -; 
NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop5, $pop7 -; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $11, $3 -; NO-SIMD128-FAST-NEXT: i32.const $push38=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop38 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $19, $pop10 -; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop9, $pop11 -; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $12, $4 -; NO-SIMD128-FAST-NEXT: i32.const $push37=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop37 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $20, $pop14 -; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop13, $pop15 -; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop16 -; NO-SIMD128-FAST-NEXT: i32.and $push17=, $13, $5 -; NO-SIMD128-FAST-NEXT: i32.const $push36=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $5, $pop36 -; NO-SIMD128-FAST-NEXT: i32.and $push19=, $21, $pop18 -; NO-SIMD128-FAST-NEXT: i32.or $push20=, $pop17, $pop19 -; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop20 -; NO-SIMD128-FAST-NEXT: i32.and $push21=, $14, $6 -; NO-SIMD128-FAST-NEXT: i32.const $push35=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push22=, $6, $pop35 -; NO-SIMD128-FAST-NEXT: i32.and $push23=, $22, $pop22 -; NO-SIMD128-FAST-NEXT: i32.or $push24=, $pop21, $pop23 -; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop24 -; NO-SIMD128-FAST-NEXT: i32.and $push25=, $15, $7 -; NO-SIMD128-FAST-NEXT: i32.const $push34=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push26=, $7, $pop34 -; NO-SIMD128-FAST-NEXT: i32.and $push27=, $23, $pop26 -; NO-SIMD128-FAST-NEXT: i32.or $push28=, $pop25, $pop27 -; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop28 -; NO-SIMD128-FAST-NEXT: i32.and $push29=, $16, $8 -; NO-SIMD128-FAST-NEXT: i32.const $push33=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push30=, $8, $pop33 -; NO-SIMD128-FAST-NEXT: i32.and $push31=, $24, $pop30 -; NO-SIMD128-FAST-NEXT: i32.or $push32=, $pop29, $pop31 -; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop32 
+; NO-SIMD128-FAST-NEXT: i32.xor $push0=, $9, $17 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $pop0, $1 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $pop1, $17 +; NO-SIMD128-FAST-NEXT: i32.store16 0($0), $pop2 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $10, $18 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop3, $2 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $pop4, $18 +; NO-SIMD128-FAST-NEXT: i32.store16 2($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $11, $19 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3 +; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $19 +; NO-SIMD128-FAST-NEXT: i32.store16 4($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.xor $push9=, $12, $20 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $20 +; NO-SIMD128-FAST-NEXT: i32.store16 6($0), $pop11 +; NO-SIMD128-FAST-NEXT: i32.xor $push12=, $13, $21 +; NO-SIMD128-FAST-NEXT: i32.and $push13=, $pop12, $5 +; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $pop13, $21 +; NO-SIMD128-FAST-NEXT: i32.store16 8($0), $pop14 +; NO-SIMD128-FAST-NEXT: i32.xor $push15=, $14, $22 +; NO-SIMD128-FAST-NEXT: i32.and $push16=, $pop15, $6 +; NO-SIMD128-FAST-NEXT: i32.xor $push17=, $pop16, $22 +; NO-SIMD128-FAST-NEXT: i32.store16 10($0), $pop17 +; NO-SIMD128-FAST-NEXT: i32.xor $push18=, $15, $23 +; NO-SIMD128-FAST-NEXT: i32.and $push19=, $pop18, $7 +; NO-SIMD128-FAST-NEXT: i32.xor $push20=, $pop19, $23 +; NO-SIMD128-FAST-NEXT: i32.store16 12($0), $pop20 +; NO-SIMD128-FAST-NEXT: i32.xor $push21=, $16, $24 +; NO-SIMD128-FAST-NEXT: i32.and $push22=, $pop21, $8 +; NO-SIMD128-FAST-NEXT: i32.xor $push23=, $pop22, $24 +; NO-SIMD128-FAST-NEXT: i32.store16 14($0), $pop23 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <8 x i16> %v1, %c %inv_mask = xor <8 x i16> @@ -9453,59 +9357,43 @@ define <4 x i32> @bitselect_v4i32(<4 x i32> %c, <4 x i32> %v1, <4 x i32> %v2) { ; NO-SIMD128-LABEL: bitselect_v4i32: ; NO-SIMD128: .functype bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, 
i32, i32, i32, i32) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i32.const $push1=, -1 -; NO-SIMD128-NEXT: i32.xor $push2=, $4, $pop1 -; NO-SIMD128-NEXT: i32.and $push3=, $pop2, $12 -; NO-SIMD128-NEXT: i32.and $push0=, $4, $8 -; NO-SIMD128-NEXT: i32.or $push4=, $pop3, $pop0 -; NO-SIMD128-NEXT: i32.store 12($0), $pop4 -; NO-SIMD128-NEXT: i32.const $push19=, -1 -; NO-SIMD128-NEXT: i32.xor $push6=, $3, $pop19 -; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $11 -; NO-SIMD128-NEXT: i32.and $push5=, $3, $7 -; NO-SIMD128-NEXT: i32.or $push8=, $pop7, $pop5 -; NO-SIMD128-NEXT: i32.store 8($0), $pop8 -; NO-SIMD128-NEXT: i32.const $push18=, -1 -; NO-SIMD128-NEXT: i32.xor $push10=, $2, $pop18 -; NO-SIMD128-NEXT: i32.and $push11=, $pop10, $10 -; NO-SIMD128-NEXT: i32.and $push9=, $2, $6 -; NO-SIMD128-NEXT: i32.or $push12=, $pop11, $pop9 -; NO-SIMD128-NEXT: i32.store 4($0), $pop12 -; NO-SIMD128-NEXT: i32.const $push17=, -1 -; NO-SIMD128-NEXT: i32.xor $push14=, $1, $pop17 -; NO-SIMD128-NEXT: i32.and $push15=, $pop14, $9 -; NO-SIMD128-NEXT: i32.and $push13=, $1, $5 -; NO-SIMD128-NEXT: i32.or $push16=, $pop15, $pop13 -; NO-SIMD128-NEXT: i32.store 0($0), $pop16 +; NO-SIMD128-NEXT: i32.xor $push0=, $8, $12 +; NO-SIMD128-NEXT: i32.and $push1=, $pop0, $4 +; NO-SIMD128-NEXT: i32.xor $push2=, $pop1, $12 +; NO-SIMD128-NEXT: i32.store 12($0), $pop2 +; NO-SIMD128-NEXT: i32.xor $push3=, $7, $11 +; NO-SIMD128-NEXT: i32.and $push4=, $pop3, $3 +; NO-SIMD128-NEXT: i32.xor $push5=, $pop4, $11 +; NO-SIMD128-NEXT: i32.store 8($0), $pop5 +; NO-SIMD128-NEXT: i32.xor $push6=, $6, $10 +; NO-SIMD128-NEXT: i32.and $push7=, $pop6, $2 +; NO-SIMD128-NEXT: i32.xor $push8=, $pop7, $10 +; NO-SIMD128-NEXT: i32.store 4($0), $pop8 +; NO-SIMD128-NEXT: i32.xor $push9=, $5, $9 +; NO-SIMD128-NEXT: i32.and $push10=, $pop9, $1 +; NO-SIMD128-NEXT: i32.xor $push11=, $pop10, $9 +; NO-SIMD128-NEXT: i32.store 0($0), $pop11 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v4i32: ; NO-SIMD128-FAST: .functype 
bitselect_v4i32 (i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: i32.const $push1=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $1, $pop1 -; NO-SIMD128-FAST-NEXT: i32.and $push3=, $pop2, $9 -; NO-SIMD128-FAST-NEXT: i32.and $push0=, $1, $5 -; NO-SIMD128-FAST-NEXT: i32.or $push4=, $pop3, $pop0 -; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop4 -; NO-SIMD128-FAST-NEXT: i32.const $push19=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $2, $pop19 -; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $10 -; NO-SIMD128-FAST-NEXT: i32.and $push5=, $2, $6 -; NO-SIMD128-FAST-NEXT: i32.or $push8=, $pop7, $pop5 -; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop8 -; NO-SIMD128-FAST-NEXT: i32.const $push18=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push10=, $3, $pop18 -; NO-SIMD128-FAST-NEXT: i32.and $push11=, $pop10, $11 -; NO-SIMD128-FAST-NEXT: i32.and $push9=, $3, $7 -; NO-SIMD128-FAST-NEXT: i32.or $push12=, $pop11, $pop9 -; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop12 -; NO-SIMD128-FAST-NEXT: i32.const $push17=, -1 -; NO-SIMD128-FAST-NEXT: i32.xor $push14=, $4, $pop17 -; NO-SIMD128-FAST-NEXT: i32.and $push15=, $pop14, $12 -; NO-SIMD128-FAST-NEXT: i32.and $push13=, $4, $8 -; NO-SIMD128-FAST-NEXT: i32.or $push16=, $pop15, $pop13 -; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop16 +; NO-SIMD128-FAST-NEXT: i32.xor $push0=, $5, $9 +; NO-SIMD128-FAST-NEXT: i32.and $push1=, $pop0, $1 +; NO-SIMD128-FAST-NEXT: i32.xor $push2=, $pop1, $9 +; NO-SIMD128-FAST-NEXT: i32.store 0($0), $pop2 +; NO-SIMD128-FAST-NEXT: i32.xor $push3=, $6, $10 +; NO-SIMD128-FAST-NEXT: i32.and $push4=, $pop3, $2 +; NO-SIMD128-FAST-NEXT: i32.xor $push5=, $pop4, $10 +; NO-SIMD128-FAST-NEXT: i32.store 4($0), $pop5 +; NO-SIMD128-FAST-NEXT: i32.xor $push6=, $7, $11 +; NO-SIMD128-FAST-NEXT: i32.and $push7=, $pop6, $3 +; NO-SIMD128-FAST-NEXT: i32.xor $push8=, $pop7, $11 +; NO-SIMD128-FAST-NEXT: i32.store 8($0), $pop8 +; NO-SIMD128-FAST-NEXT: i32.xor 
$push9=, $8, $12 +; NO-SIMD128-FAST-NEXT: i32.and $push10=, $pop9, $4 +; NO-SIMD128-FAST-NEXT: i32.xor $push11=, $pop10, $12 +; NO-SIMD128-FAST-NEXT: i32.store 12($0), $pop11 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <4 x i32> %c, %v1 %inv_mask = xor <4 x i32> , %c @@ -10974,35 +10862,27 @@ define <2 x i64> @bitselect_v2i64(<2 x i64> %c, <2 x i64> %v1, <2 x i64> %v2) { ; NO-SIMD128-LABEL: bitselect_v2i64: ; NO-SIMD128: .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> () ; NO-SIMD128-NEXT: # %bb.0: -; NO-SIMD128-NEXT: i64.const $push1=, -1 -; NO-SIMD128-NEXT: i64.xor $push2=, $2, $pop1 -; NO-SIMD128-NEXT: i64.and $push3=, $6, $pop2 -; NO-SIMD128-NEXT: i64.and $push0=, $4, $2 -; NO-SIMD128-NEXT: i64.or $push4=, $pop3, $pop0 -; NO-SIMD128-NEXT: i64.store 8($0), $pop4 -; NO-SIMD128-NEXT: i64.const $push9=, -1 -; NO-SIMD128-NEXT: i64.xor $push6=, $1, $pop9 -; NO-SIMD128-NEXT: i64.and $push7=, $5, $pop6 -; NO-SIMD128-NEXT: i64.and $push5=, $3, $1 -; NO-SIMD128-NEXT: i64.or $push8=, $pop7, $pop5 -; NO-SIMD128-NEXT: i64.store 0($0), $pop8 +; NO-SIMD128-NEXT: i64.xor $push0=, $4, $6 +; NO-SIMD128-NEXT: i64.and $push1=, $pop0, $2 +; NO-SIMD128-NEXT: i64.xor $push2=, $pop1, $6 +; NO-SIMD128-NEXT: i64.store 8($0), $pop2 +; NO-SIMD128-NEXT: i64.xor $push3=, $3, $5 +; NO-SIMD128-NEXT: i64.and $push4=, $pop3, $1 +; NO-SIMD128-NEXT: i64.xor $push5=, $pop4, $5 +; NO-SIMD128-NEXT: i64.store 0($0), $pop5 ; NO-SIMD128-NEXT: return ; ; NO-SIMD128-FAST-LABEL: bitselect_v2i64: ; NO-SIMD128-FAST: .functype bitselect_v2i64 (i32, i64, i64, i64, i64, i64, i64) -> () ; NO-SIMD128-FAST-NEXT: # %bb.0: -; NO-SIMD128-FAST-NEXT: i64.const $push1=, -1 -; NO-SIMD128-FAST-NEXT: i64.xor $push2=, $1, $pop1 -; NO-SIMD128-FAST-NEXT: i64.and $push3=, $5, $pop2 -; NO-SIMD128-FAST-NEXT: i64.and $push0=, $3, $1 -; NO-SIMD128-FAST-NEXT: i64.or $push4=, $pop3, $pop0 -; NO-SIMD128-FAST-NEXT: i64.store 0($0), $pop4 -; NO-SIMD128-FAST-NEXT: i64.const $push9=, -1 -; NO-SIMD128-FAST-NEXT: 
i64.xor $push6=, $2, $pop9 -; NO-SIMD128-FAST-NEXT: i64.and $push7=, $6, $pop6 -; NO-SIMD128-FAST-NEXT: i64.and $push5=, $4, $2 -; NO-SIMD128-FAST-NEXT: i64.or $push8=, $pop7, $pop5 -; NO-SIMD128-FAST-NEXT: i64.store 8($0), $pop8 +; NO-SIMD128-FAST-NEXT: i64.xor $push0=, $3, $5 +; NO-SIMD128-FAST-NEXT: i64.and $push1=, $pop0, $1 +; NO-SIMD128-FAST-NEXT: i64.xor $push2=, $pop1, $5 +; NO-SIMD128-FAST-NEXT: i64.store 0($0), $pop2 +; NO-SIMD128-FAST-NEXT: i64.xor $push3=, $4, $6 +; NO-SIMD128-FAST-NEXT: i64.and $push4=, $pop3, $2 +; NO-SIMD128-FAST-NEXT: i64.xor $push5=, $pop4, $6 +; NO-SIMD128-FAST-NEXT: i64.store 8($0), $pop5 ; NO-SIMD128-FAST-NEXT: return %masked_v1 = and <2 x i64> %v1, %c %inv_mask = xor <2 x i64> , %c diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll index 2922113b14ea9..4fc0827ac4dd6 100644 --- a/llvm/test/CodeGen/X86/bitselect.ll +++ b/llvm/test/CodeGen/X86/bitselect.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI ; PR46472 ; bitselect(a,b,m) == or(and(a,not(m)),and(b,m)) @@ -17,14 +17,22 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind { ; X86-NEXT: xorb %cl, %al ; X86-NEXT: retl ; -; X64-LABEL: bitselect_i8: -; X64: # %bb.0: -; X64-NEXT: andl %edx, %esi -; X64-NEXT: movl %edx, %eax -; X64-NEXT: notb %al -; X64-NEXT: andb %dil, %al -; X64-NEXT: orb %sil, %al -; X64-NEXT: retq +; X64-NOBMI-LABEL: bitselect_i8: +; X64-NOBMI: # %bb.0: +; X64-NOBMI-NEXT: movl %esi, %eax 
+; X64-NOBMI-NEXT: xorl %edi, %eax +; X64-NOBMI-NEXT: andl %edx, %eax +; X64-NOBMI-NEXT: xorl %edi, %eax +; X64-NOBMI-NEXT: # kill: def $al killed $al killed $eax +; X64-NOBMI-NEXT: retq +; +; X64-BMI-LABEL: bitselect_i8: +; X64-BMI: # %bb.0: +; X64-BMI-NEXT: andnl %edi, %edx, %eax +; X64-BMI-NEXT: andl %edx, %esi +; X64-BMI-NEXT: orl %esi, %eax +; X64-BMI-NEXT: # kill: def $al killed $al killed $eax +; X64-BMI-NEXT: retq %not = xor i8 %m, -1 %ma = and i8 %a, %not %mb = and i8 %b, %m @@ -35,21 +43,20 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind { define i16 @bitselect_i16(i16 %a, i16 %b, i16 %m) nounwind { ; X86-LABEL: bitselect_i16: ; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: xorw %ax, %cx -; X86-NEXT: andw {{[0-9]+}}(%esp), %cx +; X86-NEXT: xorw %cx, %ax +; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: xorl %ecx, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-NOBMI-LABEL: bitselect_i16: ; X64-NOBMI: # %bb.0: -; X64-NOBMI-NEXT: movl %edx, %eax -; X64-NOBMI-NEXT: andl %edx, %esi -; X64-NOBMI-NEXT: notl %eax -; X64-NOBMI-NEXT: andl %edi, %eax -; X64-NOBMI-NEXT: orl %esi, %eax +; X64-NOBMI-NEXT: movl %esi, %eax +; X64-NOBMI-NEXT: xorl %edi, %eax +; X64-NOBMI-NEXT: andl %edx, %eax +; X64-NOBMI-NEXT: xorl %edi, %eax ; X64-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NOBMI-NEXT: retq ; @@ -186,13 +193,12 @@ define i128 @bitselect_i128(i128 %a, i128 %b, i128 %m) nounwind { ; ; X64-BMI-LABEL: bitselect_i128: ; X64-BMI: # %bb.0: -; X64-BMI-NEXT: andnq %rsi, %r9, %rsi ; X64-BMI-NEXT: andnq %rdi, %r8, %rax -; X64-BMI-NEXT: andq %r9, %rcx -; X64-BMI-NEXT: orq %rcx, %rsi ; X64-BMI-NEXT: andq %r8, %rdx ; X64-BMI-NEXT: orq %rdx, %rax -; X64-BMI-NEXT: movq %rsi, %rdx +; X64-BMI-NEXT: andnq %rsi, %r9, %rdx +; X64-BMI-NEXT: andq %r9, %rcx +; X64-BMI-NEXT: orq %rcx, %rdx ; X64-BMI-NEXT: retq %not = xor i128 %m, -1 
%ma = and i128 %a, %not diff --git a/llvm/test/CodeGen/X86/fold-masked-merge.ll b/llvm/test/CodeGen/X86/fold-masked-merge.ll index b2614c5fe0493..05e7b2a2de372 100644 --- a/llvm/test/CodeGen/X86/fold-masked-merge.ll +++ b/llvm/test/CodeGen/X86/fold-masked-merge.ll @@ -30,18 +30,17 @@ define i32 @masked_merge0(i32 %a0, i32 %a1, i32 %a2) { define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { ; NOBMI-LABEL: masked_merge1: ; NOBMI: # %bb.0: -; NOBMI-NEXT: movl %edi, %eax -; NOBMI-NEXT: andl %edi, %esi -; NOBMI-NEXT: notl %eax -; NOBMI-NEXT: andl %edx, %eax -; NOBMI-NEXT: orl %esi, %eax +; NOBMI-NEXT: movl %esi, %eax +; NOBMI-NEXT: xorl %edx, %eax +; NOBMI-NEXT: andl %edi, %eax +; NOBMI-NEXT: xorl %edx, %eax ; NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; NOBMI-NEXT: retq ; ; BMI-LABEL: masked_merge1: ; BMI: # %bb.0: -; BMI-NEXT: andl %edi, %esi ; BMI-NEXT: andnl %edx, %edi, %eax +; BMI-NEXT: andl %edi, %esi ; BMI-NEXT: orl %esi, %eax ; BMI-NEXT: # kill: def $ax killed $ax killed $eax ; BMI-NEXT: retq @@ -53,20 +52,11 @@ define i16 @masked_merge1(i16 %a0, i16 %a1, i16 %a2) { } define i8 @masked_merge2(i8 %a0, i8 %a1, i8 %a2) { -; NOBMI-LABEL: masked_merge2: -; NOBMI: # %bb.0: -; NOBMI-NEXT: movl %esi, %eax -; NOBMI-NEXT: # kill: def $al killed $al killed $eax -; NOBMI-NEXT: retq -; -; BMI-LABEL: masked_merge2: -; BMI: # %bb.0: -; BMI-NEXT: movl %edi, %eax -; BMI-NEXT: notb %al -; BMI-NEXT: andb %sil, %al -; BMI-NEXT: andb %dil, %sil -; BMI-NEXT: orb %sil, %al -; BMI-NEXT: retq +; CHECK-LABEL: masked_merge2: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: retq %not = xor i8 %a0, -1 %and0 = and i8 %not, %a1 %and1 = and i8 %a1, %a0 @@ -279,3 +269,27 @@ define i32 @masked_merge_no_transform2(i32 %a0, i32 %a1, i32 %a2, ptr %p1) { store i32 %and1, ptr %p1 ret i32 %or } + +define i32 @pr137641_crash({ i8, i32 } %0) { +; NOBMI-LABEL: pr137641_crash: +; NOBMI: # %bb.0: +; NOBMI-NEXT: movl %esi, %eax 
+; NOBMI-NEXT: andl $201, %eax +; NOBMI-NEXT: xorl $1, %eax +; NOBMI-NEXT: retq +; +; BMI-LABEL: pr137641_crash: +; BMI: # %bb.0: +; BMI-NEXT: movl %esi, %eax +; BMI-NEXT: notl %eax +; BMI-NEXT: andl $1, %eax +; BMI-NEXT: andl $200, %esi +; BMI-NEXT: orl %esi, %eax +; BMI-NEXT: retq + %asmresult1.i = extractvalue { i8, i32 } %0, 1 + %not = xor i32 %asmresult1.i, 1 + %and = and i32 1, %not + %and1 = and i32 %asmresult1.i, 200 + %2 = or i32 %and, %and1 + ret i32 %2 +} diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll index 9c9d06921096c..6a55d740fe421 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll @@ -6,21 +6,18 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) { ; CHECK-NOBMI-LABEL: out8: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: andl %edx, %edi -; CHECK-NOBMI-NEXT: notb %al -; CHECK-NOBMI-NEXT: andb %sil, %al -; CHECK-NOBMI-NEXT: orb %dil, %al +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorl %esi, %eax +; CHECK-NOBMI-NEXT: andl %edx, %eax +; CHECK-NOBMI-NEXT: xorl %esi, %eax ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out8: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %edx, %eax +; CHECK-BMI-NEXT: andnl %esi, %edx, %eax ; CHECK-BMI-NEXT: andl %edx, %edi -; CHECK-BMI-NEXT: notb %al -; CHECK-BMI-NEXT: andb %sil, %al -; CHECK-BMI-NEXT: orb %dil, %al +; CHECK-BMI-NEXT: orl %edi, %eax ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %mx = and i8 %x, %mask @@ -33,18 +30,17 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) { define i16 @out16(i16 %x, i16 %y, i16 %mask) { ; CHECK-NOBMI-LABEL: out16: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: andl %edx, %edi -; CHECK-NOBMI-NEXT: notl %eax -; CHECK-NOBMI-NEXT: 
andl %esi, %eax -; CHECK-NOBMI-NEXT: orl %edi, %eax +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorl %esi, %eax +; CHECK-NOBMI-NEXT: andl %edx, %eax +; CHECK-NOBMI-NEXT: xorl %esi, %eax ; CHECK-NOBMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out16: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: andl %edx, %edi ; CHECK-BMI-NEXT: andnl %esi, %edx, %eax +; CHECK-BMI-NEXT: andl %edx, %edi ; CHECK-BMI-NEXT: orl %edi, %eax ; CHECK-BMI-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-BMI-NEXT: retq diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll index b1194bedc4e1c..809c15881cc9b 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -16,11 +16,10 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: andl %edx, %edi -; CHECK-NEXT: notb %al -; CHECK-NEXT: andb %sil, %al -; CHECK-NEXT: orb %dil, %al +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: andl %edx, %eax +; CHECK-NEXT: xorl %esi, %eax ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %mx = and <1 x i8> %x, %mask @@ -37,32 +36,28 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v2i8: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: movl %r8d, %eax +; CHECK-BASELINE-NEXT: movl %edi, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: andl %r8d, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: xorl %ecx, %esi ; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: andl %r8d, %edi -; CHECK-BASELINE-NEXT: notb %al -; 
CHECK-BASELINE-NEXT: notb %r9b -; CHECK-BASELINE-NEXT: andb %cl, %r9b -; CHECK-BASELINE-NEXT: andb %dl, %al -; CHECK-BASELINE-NEXT: orb %dil, %al -; CHECK-BASELINE-NEXT: orb %sil, %r9b +; CHECK-BASELINE-NEXT: xorl %ecx, %esi ; CHECK-BASELINE-NEXT: # kill: def $al killed $al killed $eax -; CHECK-BASELINE-NEXT: movl %r9d, %edx +; CHECK-BASELINE-NEXT: movl %esi, %edx ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v2i8: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movl %r8d, %eax +; CHECK-SSE1-NEXT: movl %edi, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: andl %r8d, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: xorl %ecx, %esi ; CHECK-SSE1-NEXT: andl %r9d, %esi -; CHECK-SSE1-NEXT: andl %r8d, %edi -; CHECK-SSE1-NEXT: notb %al -; CHECK-SSE1-NEXT: notb %r9b -; CHECK-SSE1-NEXT: andb %cl, %r9b -; CHECK-SSE1-NEXT: andb %dl, %al -; CHECK-SSE1-NEXT: orb %dil, %al -; CHECK-SSE1-NEXT: orb %sil, %r9b +; CHECK-SSE1-NEXT: xorl %ecx, %esi ; CHECK-SSE1-NEXT: # kill: def $al killed $al killed $eax -; CHECK-SSE1-NEXT: movl %r9d, %edx +; CHECK-SSE1-NEXT: movl %esi, %edx ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v2i8: @@ -86,11 +81,10 @@ define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { define <1 x i16> @out_v1i16(<1 x i16> %x, <1 x i16> %y, <1 x i16> %mask) nounwind { ; CHECK-LABEL: out_v1i16: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: andl %edx, %edi -; CHECK-NEXT: notl %eax -; CHECK-NEXT: andl %esi, %eax -; CHECK-NEXT: orl %edi, %eax +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: andl %edx, %eax +; CHECK-NEXT: xorl %esi, %eax ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax ; CHECK-NEXT: retq %mx = and <1 x i16> %x, %mask @@ -235,32 +229,28 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi define <2 x i16> @out_v2i16(<2 x i16> %x, <2 x i16> %y, <2 x i16> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v2i16: ; CHECK-BASELINE: # 
%bb.0: -; CHECK-BASELINE-NEXT: movl %r8d, %eax +; CHECK-BASELINE-NEXT: movl %edi, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: andl %r8d, %eax +; CHECK-BASELINE-NEXT: xorl %edx, %eax +; CHECK-BASELINE-NEXT: xorl %ecx, %esi ; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: andl %r8d, %edi -; CHECK-BASELINE-NEXT: notl %eax -; CHECK-BASELINE-NEXT: notl %r9d -; CHECK-BASELINE-NEXT: andl %ecx, %r9d -; CHECK-BASELINE-NEXT: orl %esi, %r9d -; CHECK-BASELINE-NEXT: andl %edx, %eax -; CHECK-BASELINE-NEXT: orl %edi, %eax +; CHECK-BASELINE-NEXT: xorl %ecx, %esi ; CHECK-BASELINE-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-BASELINE-NEXT: movl %r9d, %edx +; CHECK-BASELINE-NEXT: movl %esi, %edx ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v2i16: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movl %r8d, %eax +; CHECK-SSE1-NEXT: movl %edi, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: andl %r8d, %eax +; CHECK-SSE1-NEXT: xorl %edx, %eax +; CHECK-SSE1-NEXT: xorl %ecx, %esi ; CHECK-SSE1-NEXT: andl %r9d, %esi -; CHECK-SSE1-NEXT: andl %r8d, %edi -; CHECK-SSE1-NEXT: notl %eax -; CHECK-SSE1-NEXT: notl %r9d -; CHECK-SSE1-NEXT: andl %ecx, %r9d -; CHECK-SSE1-NEXT: orl %esi, %r9d -; CHECK-SSE1-NEXT: andl %edx, %eax -; CHECK-SSE1-NEXT: orl %edi, %eax +; CHECK-SSE1-NEXT: xorl %ecx, %esi ; CHECK-SSE1-NEXT: # kill: def $ax killed $ax killed $eax -; CHECK-SSE1-NEXT: movl %r9d, %edx +; CHECK-SSE1-NEXT: movl %esi, %edx ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v2i16: @@ -439,9 +429,12 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin ; CHECK-BASELINE-LABEL: out_v4i16: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), 
%r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: xorl %r11d, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-BASELINE-NEXT: xorl %r11d, %edx @@ -451,21 +444,21 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin ; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-BASELINE-NEXT: xorl %edi, %r8d -; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) ; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: xorl %r11d, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-SSE1-NEXT: xorl %r11d, %edx @@ -475,13 +468,10 @@ define <4 x i16> @out_v4i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) nounwin ; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-SSE1-NEXT: xorl %edi, %r8d -; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) ; 
CHECK-SSE1-NEXT: movw %dx, 2(%rax) +; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i16: @@ -506,43 +496,43 @@ define <4 x i16> @out_v4i16_undef(<4 x i16> %x, <4 x i16> %y, <4 x i16> %mask) n ; CHECK-BASELINE-LABEL: out_v4i16_undef: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: xorl %r10d, %edx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-BASELINE-NEXT: xorl %r10d, %edx ; CHECK-BASELINE-NEXT: xorl %edi, %r8d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r8w ; CHECK-BASELINE-NEXT: xorl %edi, %r8d -; CHECK-BASELINE-NEXT: xorl %r9d, %esi -; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-BASELINE-NEXT: xorl %r9d, %esi ; CHECK-BASELINE-NEXT: movw %cx, 4(%rax) -; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: movw %r8w, 6(%rax) ; CHECK-BASELINE-NEXT: movw %dx, 2(%rax) +; CHECK-BASELINE-NEXT: movw %si, (%rax) ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v4i16_undef: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %cx +; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si +; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: xorl %r10d, %edx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %dx ; CHECK-SSE1-NEXT: xorl %r10d, %edx ; CHECK-SSE1-NEXT: xorl %edi, %r8d ; CHECK-SSE1-NEXT: andw 
{{[0-9]+}}(%rsp), %r8w ; CHECK-SSE1-NEXT: xorl %edi, %r8d -; CHECK-SSE1-NEXT: xorl %r9d, %esi -; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si -; CHECK-SSE1-NEXT: xorl %r9d, %esi ; CHECK-SSE1-NEXT: movw %cx, 4(%rax) -; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: movw %r8w, 6(%rax) ; CHECK-SSE1-NEXT: movw %dx, 2(%rax) +; CHECK-SSE1-NEXT: movw %si, (%rax) ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v4i16_undef: @@ -883,14 +873,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx ; CHECK-BASELINE-NEXT: movq %rdi, %rax -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d -; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %ebp +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r14d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; CHECK-BASELINE-NEXT: movl {{[0-9]+}}(%rsp), %r12d ; CHECK-BASELINE-NEXT: xorl %r12d, %esi ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-BASELINE-NEXT: xorl %r12d, %esi @@ -906,16 +896,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-BASELINE-NEXT: xorl %ebx, %r9d ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r9w ; CHECK-BASELINE-NEXT: xorl %ebx, %r9d -; CHECK-BASELINE-NEXT: movl %r11d, %ebx -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %bx +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), 
%ebx +; CHECK-BASELINE-NEXT: xorw %r11w, %bx ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %bx ; CHECK-BASELINE-NEXT: xorl %r11d, %ebx -; CHECK-BASELINE-NEXT: movl %r10d, %r11d -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r11w +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-BASELINE-NEXT: xorw %r10w, %r11w ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r11w ; CHECK-BASELINE-NEXT: xorl %r10d, %r11d -; CHECK-BASELINE-NEXT: movl %edi, %r10d -; CHECK-BASELINE-NEXT: xorw {{[0-9]+}}(%rsp), %r10w +; CHECK-BASELINE-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-BASELINE-NEXT: xorw %di, %r10w ; CHECK-BASELINE-NEXT: andw {{[0-9]+}}(%rsp), %r10w ; CHECK-BASELINE-NEXT: xorl %edi, %r10d ; CHECK-BASELINE-NEXT: movw %r10w, 14(%rax) @@ -941,14 +931,14 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx ; CHECK-SSE1-NEXT: movq %rdi, %rax -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %edi -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebp -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r14d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r15d -; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r12d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %edi +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %ebp +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r14d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r15d +; CHECK-SSE1-NEXT: movl {{[0-9]+}}(%rsp), %r12d ; CHECK-SSE1-NEXT: xorl %r12d, %esi ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %si ; CHECK-SSE1-NEXT: xorl %r12d, %esi @@ -964,16 +954,16 @@ define <8 x i16> @out_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) nounwin ; CHECK-SSE1-NEXT: xorl %ebx, %r9d ; 
CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r9w ; CHECK-SSE1-NEXT: xorl %ebx, %r9d -; CHECK-SSE1-NEXT: movl %r11d, %ebx -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %bx +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %ebx +; CHECK-SSE1-NEXT: xorw %r11w, %bx ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %bx ; CHECK-SSE1-NEXT: xorl %r11d, %ebx -; CHECK-SSE1-NEXT: movl %r10d, %r11d -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r11w +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r11d +; CHECK-SSE1-NEXT: xorw %r10w, %r11w ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r11w ; CHECK-SSE1-NEXT: xorl %r10d, %r11d -; CHECK-SSE1-NEXT: movl %edi, %r10d -; CHECK-SSE1-NEXT: xorw {{[0-9]+}}(%rsp), %r10w +; CHECK-SSE1-NEXT: movzwl {{[0-9]+}}(%rsp), %r10d +; CHECK-SSE1-NEXT: xorw %di, %r10w ; CHECK-SSE1-NEXT: andw {{[0-9]+}}(%rsp), %r10w ; CHECK-SSE1-NEXT: xorl %edi, %r10d ; CHECK-SSE1-NEXT: movw %r10w, 14(%rax) @@ -1759,113 +1749,117 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-BASELINE-NEXT: pushq %r13 ; CHECK-BASELINE-NEXT: pushq %r12 ; CHECK-BASELINE-NEXT: pushq %rbx -; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %r15d -; CHECK-BASELINE-NEXT: movzwl 16(%rdx), %r14d -; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %ebp -; CHECK-BASELINE-NEXT: movzwl 12(%rdx), %ebx -; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzwl 8(%rdx), %r11d -; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzwl 4(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzwl (%rdx), %r8d -; CHECK-BASELINE-NEXT: movzwl 2(%rdx), %r12d -; CHECK-BASELINE-NEXT: movzwl (%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r8w, %ax -; CHECK-BASELINE-NEXT: andw (%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d -; CHECK-BASELINE-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 2(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r12w, %ax -; CHECK-BASELINE-NEXT: andw 2(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r12d -; CHECK-BASELINE-NEXT: movzwl 4(%rsi), %eax 
-; CHECK-BASELINE-NEXT: xorw %r9w, %ax -; CHECK-BASELINE-NEXT: andw 4(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r9d -; CHECK-BASELINE-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 6(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r10w, %ax -; CHECK-BASELINE-NEXT: andw 6(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r10d -; CHECK-BASELINE-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 8(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r11w, %ax -; CHECK-BASELINE-NEXT: andw 8(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r11d -; CHECK-BASELINE-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 10(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax -; CHECK-BASELINE-NEXT: andw 10(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d -; CHECK-BASELINE-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-BASELINE-NEXT: movzwl 12(%rsi), %eax +; CHECK-BASELINE-NEXT: movq %rcx, %r10 +; CHECK-BASELINE-NEXT: movq %rdx, %r8 +; CHECK-BASELINE-NEXT: movq %rsi, %r9 +; CHECK-BASELINE-NEXT: movq %rdi, %r11 +; CHECK-BASELINE-NEXT: movzwl 18(%rdx), %ebp +; CHECK-BASELINE-NEXT: movl 16(%rdx), %r15d +; CHECK-BASELINE-NEXT: movzwl 14(%rdx), %r13d +; CHECK-BASELINE-NEXT: movl 12(%rdx), %r12d +; CHECK-BASELINE-NEXT: movzwl 10(%rdx), %r14d +; CHECK-BASELINE-NEXT: movl 8(%rdx), %ebx +; CHECK-BASELINE-NEXT: movzwl 6(%rdx), %eax +; CHECK-BASELINE-NEXT: movl (%rdx), %ecx +; CHECK-BASELINE-NEXT: movl 4(%rdx), %edx +; CHECK-BASELINE-NEXT: movzwl 2(%r8), %esi +; CHECK-BASELINE-NEXT: movzwl (%r9), %edi +; CHECK-BASELINE-NEXT: xorw %cx, %di +; CHECK-BASELINE-NEXT: andw (%r10), %di +; CHECK-BASELINE-NEXT: xorl %ecx, %edi +; CHECK-BASELINE-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 2(%r9), %ecx +; CHECK-BASELINE-NEXT: xorw %si, %cx +; CHECK-BASELINE-NEXT: andw 2(%r10), %cx +; CHECK-BASELINE-NEXT: xorl %esi, %ecx +; 
CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 4(%r9), %ecx +; CHECK-BASELINE-NEXT: xorw %dx, %cx +; CHECK-BASELINE-NEXT: andw 4(%r10), %cx +; CHECK-BASELINE-NEXT: xorl %edx, %ecx +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 6(%r9), %ecx +; CHECK-BASELINE-NEXT: xorw %ax, %cx +; CHECK-BASELINE-NEXT: andw 6(%r10), %cx +; CHECK-BASELINE-NEXT: xorl %eax, %ecx +; CHECK-BASELINE-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 8(%r9), %eax ; CHECK-BASELINE-NEXT: xorw %bx, %ax -; CHECK-BASELINE-NEXT: andw 12(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %ebx -; CHECK-BASELINE-NEXT: movzwl 14(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %bp, %ax -; CHECK-BASELINE-NEXT: andw 14(%rcx), %ax +; CHECK-BASELINE-NEXT: andw 8(%r10), %ax +; CHECK-BASELINE-NEXT: xorl %ebx, %eax +; CHECK-BASELINE-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-BASELINE-NEXT: movzwl 10(%r9), %ebx +; CHECK-BASELINE-NEXT: xorw %r14w, %bx +; CHECK-BASELINE-NEXT: andw 10(%r10), %bx +; CHECK-BASELINE-NEXT: xorl %r14d, %ebx +; CHECK-BASELINE-NEXT: movzwl 12(%r9), %r14d +; CHECK-BASELINE-NEXT: xorw %r12w, %r14w +; CHECK-BASELINE-NEXT: andw 12(%r10), %r14w +; CHECK-BASELINE-NEXT: xorl %r12d, %r14d +; CHECK-BASELINE-NEXT: movzwl 14(%r9), %r12d +; CHECK-BASELINE-NEXT: xorw %r13w, %r12w +; CHECK-BASELINE-NEXT: andw 14(%r10), %r12w +; CHECK-BASELINE-NEXT: xorl %r13d, %r12d +; CHECK-BASELINE-NEXT: movzwl 16(%r9), %r13d +; CHECK-BASELINE-NEXT: xorw %r15w, %r13w +; CHECK-BASELINE-NEXT: andw 16(%r10), %r13w +; CHECK-BASELINE-NEXT: xorl %r15d, %r13d +; CHECK-BASELINE-NEXT: movzwl 18(%r9), %r15d +; CHECK-BASELINE-NEXT: xorw %bp, %r15w +; CHECK-BASELINE-NEXT: andw 18(%r10), %r15w +; CHECK-BASELINE-NEXT: xorl %ebp, %r15d +; CHECK-BASELINE-NEXT: movl 20(%r8), %eax +; CHECK-BASELINE-NEXT: movzwl 20(%r9), %ebp +; CHECK-BASELINE-NEXT: xorw %ax, %bp +; 
CHECK-BASELINE-NEXT: andw 20(%r10), %bp ; CHECK-BASELINE-NEXT: xorl %eax, %ebp -; CHECK-BASELINE-NEXT: movzwl 16(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r14w, %ax -; CHECK-BASELINE-NEXT: andw 16(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r14d -; CHECK-BASELINE-NEXT: movzwl 18(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r15w, %ax -; CHECK-BASELINE-NEXT: andw 18(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r15d -; CHECK-BASELINE-NEXT: movzwl 20(%rdx), %r13d -; CHECK-BASELINE-NEXT: movzwl 20(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r13w, %ax -; CHECK-BASELINE-NEXT: andw 20(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r13d -; CHECK-BASELINE-NEXT: movzwl 22(%rdx), %r9d -; CHECK-BASELINE-NEXT: movzwl 22(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r9w, %ax -; CHECK-BASELINE-NEXT: andw 22(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r9d -; CHECK-BASELINE-NEXT: movzwl 24(%rdx), %r8d -; CHECK-BASELINE-NEXT: movzwl 24(%rsi), %eax -; CHECK-BASELINE-NEXT: xorw %r8w, %ax -; CHECK-BASELINE-NEXT: andw 24(%rcx), %ax -; CHECK-BASELINE-NEXT: xorl %eax, %r8d -; CHECK-BASELINE-NEXT: movzwl 26(%rdx), %eax -; CHECK-BASELINE-NEXT: movzwl 26(%rsi), %r10d -; CHECK-BASELINE-NEXT: xorw %ax, %r10w -; CHECK-BASELINE-NEXT: andw 26(%rcx), %r10w -; CHECK-BASELINE-NEXT: xorl %r10d, %eax -; CHECK-BASELINE-NEXT: movzwl 28(%rdx), %r10d -; CHECK-BASELINE-NEXT: movzwl 28(%rsi), %r11d -; CHECK-BASELINE-NEXT: xorw %r10w, %r11w -; CHECK-BASELINE-NEXT: andw 28(%rcx), %r11w -; CHECK-BASELINE-NEXT: xorl %r11d, %r10d -; CHECK-BASELINE-NEXT: movzwl 30(%rdx), %edx -; CHECK-BASELINE-NEXT: movzwl 30(%rsi), %esi -; CHECK-BASELINE-NEXT: xorw %dx, %si -; CHECK-BASELINE-NEXT: andw 30(%rcx), %si -; CHECK-BASELINE-NEXT: xorl %esi, %edx -; CHECK-BASELINE-NEXT: movw %dx, 30(%rdi) -; CHECK-BASELINE-NEXT: movw %r10w, 28(%rdi) -; CHECK-BASELINE-NEXT: movw %ax, 26(%rdi) -; CHECK-BASELINE-NEXT: movw %r8w, 24(%rdi) -; CHECK-BASELINE-NEXT: movw %r9w, 22(%rdi) -; CHECK-BASELINE-NEXT: movw %r13w, 20(%rdi) -; 
CHECK-BASELINE-NEXT: movw %r15w, 18(%rdi) -; CHECK-BASELINE-NEXT: movw %r14w, 16(%rdi) -; CHECK-BASELINE-NEXT: movw %bp, 14(%rdi) -; CHECK-BASELINE-NEXT: movw %bx, 12(%rdi) +; CHECK-BASELINE-NEXT: movzwl 22(%r8), %eax +; CHECK-BASELINE-NEXT: movzwl 22(%r9), %esi +; CHECK-BASELINE-NEXT: xorw %ax, %si +; CHECK-BASELINE-NEXT: andw 22(%r10), %si +; CHECK-BASELINE-NEXT: xorl %eax, %esi +; CHECK-BASELINE-NEXT: movl 24(%r8), %eax +; CHECK-BASELINE-NEXT: movzwl 24(%r9), %edx +; CHECK-BASELINE-NEXT: xorw %ax, %dx +; CHECK-BASELINE-NEXT: andw 24(%r10), %dx +; CHECK-BASELINE-NEXT: xorl %eax, %edx +; CHECK-BASELINE-NEXT: movzwl 26(%r8), %eax +; CHECK-BASELINE-NEXT: movzwl 26(%r9), %ecx +; CHECK-BASELINE-NEXT: xorw %ax, %cx +; CHECK-BASELINE-NEXT: andw 26(%r10), %cx +; CHECK-BASELINE-NEXT: xorl %eax, %ecx +; CHECK-BASELINE-NEXT: movl 28(%r8), %edi +; CHECK-BASELINE-NEXT: movzwl 28(%r9), %eax +; CHECK-BASELINE-NEXT: xorw %di, %ax +; CHECK-BASELINE-NEXT: andw 28(%r10), %ax +; CHECK-BASELINE-NEXT: xorl %edi, %eax +; CHECK-BASELINE-NEXT: movzwl 30(%r8), %edi +; CHECK-BASELINE-NEXT: movzwl 30(%r9), %r8d +; CHECK-BASELINE-NEXT: xorw %di, %r8w +; CHECK-BASELINE-NEXT: andw 30(%r10), %r8w +; CHECK-BASELINE-NEXT: xorl %edi, %r8d +; CHECK-BASELINE-NEXT: movw %r8w, 30(%r11) +; CHECK-BASELINE-NEXT: movw %ax, 28(%r11) +; CHECK-BASELINE-NEXT: movw %cx, 26(%r11) +; CHECK-BASELINE-NEXT: movw %dx, 24(%r11) +; CHECK-BASELINE-NEXT: movw %si, 22(%r11) +; CHECK-BASELINE-NEXT: movw %bp, 20(%r11) +; CHECK-BASELINE-NEXT: movw %r15w, 18(%r11) +; CHECK-BASELINE-NEXT: movw %r13w, 16(%r11) +; CHECK-BASELINE-NEXT: movw %r12w, 14(%r11) +; CHECK-BASELINE-NEXT: movw %r14w, 12(%r11) +; CHECK-BASELINE-NEXT: movw %bx, 10(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 10(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 8(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 8(%rdi) +; 
CHECK-BASELINE-NEXT: movw %ax, 6(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 6(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 4(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, 4(%rdi) -; CHECK-BASELINE-NEXT: movw %r12w, 2(%rdi) +; CHECK-BASELINE-NEXT: movw %ax, 2(%r11) ; CHECK-BASELINE-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-BASELINE-NEXT: movw %ax, (%rdi) -; CHECK-BASELINE-NEXT: movq %rdi, %rax +; CHECK-BASELINE-NEXT: movw %ax, (%r11) +; CHECK-BASELINE-NEXT: movq %r11, %rax ; CHECK-BASELINE-NEXT: popq %rbx ; CHECK-BASELINE-NEXT: popq %r12 ; CHECK-BASELINE-NEXT: popq %r13 @@ -1882,113 +1876,117 @@ define <16 x i16> @out_v16i16(ptr%px, ptr%py, ptr%pmask) nounwind { ; CHECK-SSE1-NEXT: pushq %r13 ; CHECK-SSE1-NEXT: pushq %r12 ; CHECK-SSE1-NEXT: pushq %rbx -; CHECK-SSE1-NEXT: movzwl 18(%rdx), %r15d -; CHECK-SSE1-NEXT: movzwl 16(%rdx), %r14d -; CHECK-SSE1-NEXT: movzwl 14(%rdx), %ebp -; CHECK-SSE1-NEXT: movzwl 12(%rdx), %ebx -; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r13d -; CHECK-SSE1-NEXT: movzwl 8(%rdx), %r11d -; CHECK-SSE1-NEXT: movzwl 6(%rdx), %r10d -; CHECK-SSE1-NEXT: movzwl 4(%rdx), %r9d -; CHECK-SSE1-NEXT: movzwl (%rdx), %r8d -; CHECK-SSE1-NEXT: movzwl 2(%rdx), %r12d -; CHECK-SSE1-NEXT: movzwl (%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r8w, %ax -; CHECK-SSE1-NEXT: andw (%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d -; CHECK-SSE1-NEXT: movl %r8d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 2(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r12w, %ax -; CHECK-SSE1-NEXT: andw 2(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r12d -; CHECK-SSE1-NEXT: movzwl 4(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r9w, %ax -; CHECK-SSE1-NEXT: andw 4(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r9d -; CHECK-SSE1-NEXT: movl %r9d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 6(%rsi), %eax -; CHECK-SSE1-NEXT: xorw 
%r10w, %ax -; CHECK-SSE1-NEXT: andw 6(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r10d -; CHECK-SSE1-NEXT: movl %r10d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 8(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r11w, %ax -; CHECK-SSE1-NEXT: andw 8(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r11d -; CHECK-SSE1-NEXT: movl %r11d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 10(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax -; CHECK-SSE1-NEXT: andw 10(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d -; CHECK-SSE1-NEXT: movl %r13d, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; CHECK-SSE1-NEXT: movzwl 12(%rsi), %eax +; CHECK-SSE1-NEXT: movq %rcx, %r10 +; CHECK-SSE1-NEXT: movq %rdx, %r8 +; CHECK-SSE1-NEXT: movq %rsi, %r9 +; CHECK-SSE1-NEXT: movq %rdi, %r11 +; CHECK-SSE1-NEXT: movzwl 18(%rdx), %ebp +; CHECK-SSE1-NEXT: movl 16(%rdx), %r15d +; CHECK-SSE1-NEXT: movzwl 14(%rdx), %r13d +; CHECK-SSE1-NEXT: movl 12(%rdx), %r12d +; CHECK-SSE1-NEXT: movzwl 10(%rdx), %r14d +; CHECK-SSE1-NEXT: movl 8(%rdx), %ebx +; CHECK-SSE1-NEXT: movzwl 6(%rdx), %eax +; CHECK-SSE1-NEXT: movl (%rdx), %ecx +; CHECK-SSE1-NEXT: movl 4(%rdx), %edx +; CHECK-SSE1-NEXT: movzwl 2(%r8), %esi +; CHECK-SSE1-NEXT: movzwl (%r9), %edi +; CHECK-SSE1-NEXT: xorw %cx, %di +; CHECK-SSE1-NEXT: andw (%r10), %di +; CHECK-SSE1-NEXT: xorl %ecx, %edi +; CHECK-SSE1-NEXT: movl %edi, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 2(%r9), %ecx +; CHECK-SSE1-NEXT: xorw %si, %cx +; CHECK-SSE1-NEXT: andw 2(%r10), %cx +; CHECK-SSE1-NEXT: xorl %esi, %ecx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 4(%r9), %ecx +; CHECK-SSE1-NEXT: xorw %dx, %cx +; CHECK-SSE1-NEXT: andw 4(%r10), %cx +; CHECK-SSE1-NEXT: xorl %edx, %ecx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 6(%r9), %ecx +; CHECK-SSE1-NEXT: xorw %ax, %cx +; CHECK-SSE1-NEXT: andw 6(%r10), %cx +; CHECK-SSE1-NEXT: xorl %eax, 
%ecx +; CHECK-SSE1-NEXT: movl %ecx, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 8(%r9), %eax ; CHECK-SSE1-NEXT: xorw %bx, %ax -; CHECK-SSE1-NEXT: andw 12(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %ebx -; CHECK-SSE1-NEXT: movzwl 14(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %bp, %ax -; CHECK-SSE1-NEXT: andw 14(%rcx), %ax +; CHECK-SSE1-NEXT: andw 8(%r10), %ax +; CHECK-SSE1-NEXT: xorl %ebx, %eax +; CHECK-SSE1-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-SSE1-NEXT: movzwl 10(%r9), %ebx +; CHECK-SSE1-NEXT: xorw %r14w, %bx +; CHECK-SSE1-NEXT: andw 10(%r10), %bx +; CHECK-SSE1-NEXT: xorl %r14d, %ebx +; CHECK-SSE1-NEXT: movzwl 12(%r9), %r14d +; CHECK-SSE1-NEXT: xorw %r12w, %r14w +; CHECK-SSE1-NEXT: andw 12(%r10), %r14w +; CHECK-SSE1-NEXT: xorl %r12d, %r14d +; CHECK-SSE1-NEXT: movzwl 14(%r9), %r12d +; CHECK-SSE1-NEXT: xorw %r13w, %r12w +; CHECK-SSE1-NEXT: andw 14(%r10), %r12w +; CHECK-SSE1-NEXT: xorl %r13d, %r12d +; CHECK-SSE1-NEXT: movzwl 16(%r9), %r13d +; CHECK-SSE1-NEXT: xorw %r15w, %r13w +; CHECK-SSE1-NEXT: andw 16(%r10), %r13w +; CHECK-SSE1-NEXT: xorl %r15d, %r13d +; CHECK-SSE1-NEXT: movzwl 18(%r9), %r15d +; CHECK-SSE1-NEXT: xorw %bp, %r15w +; CHECK-SSE1-NEXT: andw 18(%r10), %r15w +; CHECK-SSE1-NEXT: xorl %ebp, %r15d +; CHECK-SSE1-NEXT: movl 20(%r8), %eax +; CHECK-SSE1-NEXT: movzwl 20(%r9), %ebp +; CHECK-SSE1-NEXT: xorw %ax, %bp +; CHECK-SSE1-NEXT: andw 20(%r10), %bp ; CHECK-SSE1-NEXT: xorl %eax, %ebp -; CHECK-SSE1-NEXT: movzwl 16(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r14w, %ax -; CHECK-SSE1-NEXT: andw 16(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r14d -; CHECK-SSE1-NEXT: movzwl 18(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r15w, %ax -; CHECK-SSE1-NEXT: andw 18(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r15d -; CHECK-SSE1-NEXT: movzwl 20(%rdx), %r13d -; CHECK-SSE1-NEXT: movzwl 20(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r13w, %ax -; CHECK-SSE1-NEXT: andw 20(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r13d -; CHECK-SSE1-NEXT: movzwl 22(%rdx), 
%r9d -; CHECK-SSE1-NEXT: movzwl 22(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r9w, %ax -; CHECK-SSE1-NEXT: andw 22(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r9d -; CHECK-SSE1-NEXT: movzwl 24(%rdx), %r8d -; CHECK-SSE1-NEXT: movzwl 24(%rsi), %eax -; CHECK-SSE1-NEXT: xorw %r8w, %ax -; CHECK-SSE1-NEXT: andw 24(%rcx), %ax -; CHECK-SSE1-NEXT: xorl %eax, %r8d -; CHECK-SSE1-NEXT: movzwl 26(%rdx), %eax -; CHECK-SSE1-NEXT: movzwl 26(%rsi), %r10d -; CHECK-SSE1-NEXT: xorw %ax, %r10w -; CHECK-SSE1-NEXT: andw 26(%rcx), %r10w -; CHECK-SSE1-NEXT: xorl %r10d, %eax -; CHECK-SSE1-NEXT: movzwl 28(%rdx), %r10d -; CHECK-SSE1-NEXT: movzwl 28(%rsi), %r11d -; CHECK-SSE1-NEXT: xorw %r10w, %r11w -; CHECK-SSE1-NEXT: andw 28(%rcx), %r11w -; CHECK-SSE1-NEXT: xorl %r11d, %r10d -; CHECK-SSE1-NEXT: movzwl 30(%rdx), %edx -; CHECK-SSE1-NEXT: movzwl 30(%rsi), %esi -; CHECK-SSE1-NEXT: xorw %dx, %si -; CHECK-SSE1-NEXT: andw 30(%rcx), %si -; CHECK-SSE1-NEXT: xorl %esi, %edx -; CHECK-SSE1-NEXT: movw %dx, 30(%rdi) -; CHECK-SSE1-NEXT: movw %r10w, 28(%rdi) -; CHECK-SSE1-NEXT: movw %ax, 26(%rdi) -; CHECK-SSE1-NEXT: movw %r8w, 24(%rdi) -; CHECK-SSE1-NEXT: movw %r9w, 22(%rdi) -; CHECK-SSE1-NEXT: movw %r13w, 20(%rdi) -; CHECK-SSE1-NEXT: movw %r15w, 18(%rdi) -; CHECK-SSE1-NEXT: movw %r14w, 16(%rdi) -; CHECK-SSE1-NEXT: movw %bp, 14(%rdi) -; CHECK-SSE1-NEXT: movw %bx, 12(%rdi) +; CHECK-SSE1-NEXT: movzwl 22(%r8), %eax +; CHECK-SSE1-NEXT: movzwl 22(%r9), %esi +; CHECK-SSE1-NEXT: xorw %ax, %si +; CHECK-SSE1-NEXT: andw 22(%r10), %si +; CHECK-SSE1-NEXT: xorl %eax, %esi +; CHECK-SSE1-NEXT: movl 24(%r8), %eax +; CHECK-SSE1-NEXT: movzwl 24(%r9), %edx +; CHECK-SSE1-NEXT: xorw %ax, %dx +; CHECK-SSE1-NEXT: andw 24(%r10), %dx +; CHECK-SSE1-NEXT: xorl %eax, %edx +; CHECK-SSE1-NEXT: movzwl 26(%r8), %eax +; CHECK-SSE1-NEXT: movzwl 26(%r9), %ecx +; CHECK-SSE1-NEXT: xorw %ax, %cx +; CHECK-SSE1-NEXT: andw 26(%r10), %cx +; CHECK-SSE1-NEXT: xorl %eax, %ecx +; CHECK-SSE1-NEXT: movl 28(%r8), %edi +; CHECK-SSE1-NEXT: movzwl 28(%r9), %eax +; 
CHECK-SSE1-NEXT: xorw %di, %ax +; CHECK-SSE1-NEXT: andw 28(%r10), %ax +; CHECK-SSE1-NEXT: xorl %edi, %eax +; CHECK-SSE1-NEXT: movzwl 30(%r8), %edi +; CHECK-SSE1-NEXT: movzwl 30(%r9), %r8d +; CHECK-SSE1-NEXT: xorw %di, %r8w +; CHECK-SSE1-NEXT: andw 30(%r10), %r8w +; CHECK-SSE1-NEXT: xorl %edi, %r8d +; CHECK-SSE1-NEXT: movw %r8w, 30(%r11) +; CHECK-SSE1-NEXT: movw %ax, 28(%r11) +; CHECK-SSE1-NEXT: movw %cx, 26(%r11) +; CHECK-SSE1-NEXT: movw %dx, 24(%r11) +; CHECK-SSE1-NEXT: movw %si, 22(%r11) +; CHECK-SSE1-NEXT: movw %bp, 20(%r11) +; CHECK-SSE1-NEXT: movw %r15w, 18(%r11) +; CHECK-SSE1-NEXT: movw %r13w, 16(%r11) +; CHECK-SSE1-NEXT: movw %r12w, 14(%r11) +; CHECK-SSE1-NEXT: movw %r14w, 12(%r11) +; CHECK-SSE1-NEXT: movw %bx, 10(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 10(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 8(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 8(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 6(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 6(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 4(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, 4(%rdi) -; CHECK-SSE1-NEXT: movw %r12w, 2(%rdi) +; CHECK-SSE1-NEXT: movw %ax, 2(%r11) ; CHECK-SSE1-NEXT: movl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 4-byte Reload -; CHECK-SSE1-NEXT: movw %ax, (%rdi) -; CHECK-SSE1-NEXT: movq %rdi, %rax +; CHECK-SSE1-NEXT: movw %ax, (%r11) +; CHECK-SSE1-NEXT: movq %r11, %rax ; CHECK-SSE1-NEXT: popq %rbx ; CHECK-SSE1-NEXT: popq %r12 ; CHECK-SSE1-NEXT: popq %r13 From 937be177528de156922c1b5f6cab08ba3009dbf2 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 11 Jun 2025 10:10:22 +0200 Subject: [PATCH 043/851] [flang] Enable delayed localization by default for `do concurrent` (#142567) This PR aims to make it easier and more self-contained to revert the switch/flag if we discover 
any problems with enabling it by default. --- flang/lib/Lower/Bridge.cpp | 6 +----- flang/test/Lower/do_concurrent_delayed_locality.f90 | 2 +- flang/test/Lower/do_concurrent_local_assoc_entity.f90 | 2 +- flang/test/Lower/do_concurrent_local_default_init.f90 | 2 +- flang/test/Lower/loops.f90 | 2 +- flang/test/Lower/loops3.f90 | 2 +- 6 files changed, 6 insertions(+), 10 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index 64b16b3abe991..5ff8101dba097 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -2033,11 +2033,7 @@ class FirConverter : public Fortran::lower::AbstractConverter { fir::LocalitySpecifierOperands privateClauseOps; auto doConcurrentLoopOp = mlir::dyn_cast_if_present(info.loopOp); - // TODO Promote to using `enableDelayedPrivatization` (which is enabled by - // default unlike the staging flag) once the implementation of this is more - // complete. - bool useDelayedPriv = - enableDelayedPrivatizationStaging && doConcurrentLoopOp; + bool useDelayedPriv = enableDelayedPrivatization && doConcurrentLoopOp; llvm::SetVector allPrivatizedSymbols; llvm::SmallSet mightHaveReadHostSym; diff --git a/flang/test/Lower/do_concurrent_delayed_locality.f90 b/flang/test/Lower/do_concurrent_delayed_locality.f90 index 6cae0eb46db13..039b17808d19e 100644 --- a/flang/test/Lower/do_concurrent_delayed_locality.f90 +++ b/flang/test/Lower/do_concurrent_delayed_locality.f90 @@ -1,4 +1,4 @@ -! RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s subroutine do_concurrent_with_locality_specs implicit none diff --git a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 b/flang/test/Lower/do_concurrent_local_assoc_entity.f90 index a3d0c34ed8569..67f080eb2c1c5 100644 --- a/flang/test/Lower/do_concurrent_local_assoc_entity.f90 +++ b/flang/test/Lower/do_concurrent_local_assoc_entity.f90 @@ -1,4 +1,4 @@ -! 
RUN: %flang_fc1 -emit-hlfir -mmlir --enable-delayed-privatization-staging=true -o - %s | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -o - %s | FileCheck %s subroutine local_assoc implicit none diff --git a/flang/test/Lower/do_concurrent_local_default_init.f90 b/flang/test/Lower/do_concurrent_local_default_init.f90 index d643213854744..798cbb335c8c0 100644 --- a/flang/test/Lower/do_concurrent_local_default_init.f90 +++ b/flang/test/Lower/do_concurrent_local_default_init.f90 @@ -1,5 +1,5 @@ ! Test default initialization of DO CONCURRENT LOCAL() entities. -! RUN: bbc -emit-hlfir --enable-delayed-privatization-staging=true -I nowhere -o - %s | FileCheck %s +! RUN: bbc -emit-hlfir -I nowhere -o - %s | FileCheck %s subroutine test_ptr(p) interface diff --git a/flang/test/Lower/loops.f90 b/flang/test/Lower/loops.f90 index 60df27a591dc3..64f14ff972272 100644 --- a/flang/test/Lower/loops.f90 +++ b/flang/test/Lower/loops.f90 @@ -1,4 +1,4 @@ -! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s +! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s ! CHECK-LABEL: loop_test subroutine loop_test diff --git a/flang/test/Lower/loops3.f90 b/flang/test/Lower/loops3.f90 index 84db1972cca16..34d7bcfb7d7ad 100644 --- a/flang/test/Lower/loops3.f90 +++ b/flang/test/Lower/loops3.f90 @@ -1,5 +1,5 @@ ! Test do concurrent reduction -! RUN: bbc -emit-fir -hlfir=false -o - %s | FileCheck %s +! RUN: bbc -emit-fir -hlfir=false --enable-delayed-privatization=false -o - %s | FileCheck %s ! CHECK-LABEL: loop_test subroutine loop_test From afbcf9529a1edb88d067e6fca8d9534901310d5e Mon Sep 17 00:00:00 2001 From: CHANDRA GHALE Date: Wed, 11 Jun 2025 14:01:31 +0530 Subject: [PATCH 044/851] [OpenMP 6.0 ]Codegen for Reduction over private variables with reduction clause (#134709) Codegen support for reduction over private variable with reduction clause. Section 7.6.10 in the OpenMP 6.0 spec. - An internal shared copy is initialized with an initializer value.
- The shared copy is updated by combining its value with the values from the private copies created by the clause. - Once an encountering thread verifies that all updates are complete, its original list item is updated by merging its value with that of the shared copy and then broadcast to all threads. Sample Test Case from OpenMP 6.0 Example ``` #include #include #define N 10 void do_red(int n, int *v, int &sum_v) { sum_v = 0; // sum_v is private #pragma omp for reduction(original(private),+: sum_v) for (int i = 0; i < n; i++) { sum_v += v[i]; } } int main(void) { int v[N]; for (int i = 0; i < N; i++) v[i] = i; #pragma omp parallel num_threads(4) { int s_v; // s_v is private do_red(N, v, s_v); assert(s_v == 45); } return 0; } ``` Expected Codegen: ``` // A shared global/static variable is introduced for the reduction result. // This variable is initialized (e.g., using memset or a UDR initializer) // e.g., .omp.reduction.internal_private_var // Barrier before any thread performs combination call void @__kmpc_barrier(...) // Initialization block (executed by thread 0) // e.g., call void @llvm.memset.p0.i64(...) or call @udr_initializer(...) call void @__kmpc_critical(...) // Inside critical section: // Load the current value from the shared variable // Load the thread-local private variable's value // Perform the reduction operation // Store the result back to the shared variable call void @__kmpc_end_critical(...) // Barrier after all threads complete their combinations call void @__kmpc_barrier(...) // Broadcast phase: // Load the final result from the shared variable // Store the final result to the original private variable in each thread // Final barrier after broadcast call void @__kmpc_barrier(...) 
``` --------- Co-authored-by: Chandra Ghale --- clang/docs/OpenMPSupport.rst | 3 +- clang/docs/ReleaseNotes.rst | 1 + clang/lib/CodeGen/CGOpenMPRuntime.cpp | 292 ++++++- clang/lib/CodeGen/CGOpenMPRuntime.h | 12 + clang/lib/CodeGen/CGStmtOpenMP.cpp | 11 +- clang/lib/Sema/SemaOpenMP.cpp | 41 +- .../OpenMP/distribute_simd_misc_messages.c | 3 +- .../OpenMP/for_private_reduction_codegen.cpp | 710 ++++++++++++++++++ clang/test/OpenMP/for_reduction_messages.cpp | 2 + .../OpenMP/for_simd_reduction_messages.cpp | 2 +- .../OpenMP/sections_reduction_messages.cpp | 2 +- .../for/omp_for_private_reduction.cpp | 194 +++++ 12 files changed, 1235 insertions(+), 38 deletions(-) create mode 100644 clang/test/OpenMP/for_private_reduction_codegen.cpp create mode 100644 openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index d6507071d4693..986aaabe1eed4 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -406,7 +406,8 @@ implementation. 
+-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Extensions to atomic construct | :none:`unclaimed` | :none:`unclaimed` | | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ -| Private reductions | :part:`partial` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 | +| Private reductions | :good:`mostly` | :none:`unclaimed` | Parse/Sema:https://github.com/llvm/llvm-project/pull/129938 | +| | | | Codegen: https://github.com/llvm/llvm-project/pull/134709 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ | Self maps | :part:`partial` | :none:`unclaimed` | parsing/sema done: https://github.com/llvm/llvm-project/pull/129888 | +-------------------------------------------------------------+---------------------------+---------------------------+--------------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index f36c82bff2ef8..5645edc73431b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1100,6 +1100,7 @@ OpenMP Support open parenthesis. (#GH139665) - An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have an argument larger than what can fit within a 64-bit integer. +- Added support for private variable reduction. 
Improvements ^^^^^^^^^^^^ diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 09e3ccc380ae3..4173355491fd4 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -4907,11 +4907,255 @@ void CGOpenMPRuntime::emitSingleReductionCombiner(CodeGenFunction &CGF, } } +static std::string generateUniqueName(CodeGenModule &CGM, + llvm::StringRef Prefix, const Expr *Ref); + +void CGOpenMPRuntime::emitPrivateReduction( + CodeGenFunction &CGF, SourceLocation Loc, const Expr *Privates, + const Expr *LHSExprs, const Expr *RHSExprs, const Expr *ReductionOps) { + + // Create a shared global variable (__shared_reduction_var) to accumulate the + // final result. + // + // Call __kmpc_barrier to synchronize threads before initialization. + // + // The master thread (thread_id == 0) initializes __shared_reduction_var + // with the identity value or initializer. + // + // Call __kmpc_barrier to synchronize before combining. + // For each i: + // - Thread enters critical section. + // - Reads its private value from LHSExprs[i]. + // - Updates __shared_reduction_var[i] = RedOp_i(__shared_reduction_var[i], + // Privates[i]). + // - Exits critical section. + // + // Call __kmpc_barrier after combining. + // + // Each thread copies __shared_reduction_var[i] back to RHSExprs[i]. 
+ // + // Final __kmpc_barrier to synchronize after broadcasting + QualType PrivateType = Privates->getType(); + llvm::Type *LLVMType = CGF.ConvertTypeForMem(PrivateType); + + const OMPDeclareReductionDecl *UDR = getReductionInit(ReductionOps); + std::string ReductionVarNameStr; + if (const auto *DRE = dyn_cast(Privates->IgnoreParenCasts())) + ReductionVarNameStr = + generateUniqueName(CGM, DRE->getDecl()->getNameAsString(), Privates); + else + ReductionVarNameStr = "unnamed_priv_var"; + + // Create an internal shared variable + std::string SharedName = + CGM.getOpenMPRuntime().getName({"internal_pivate_", ReductionVarNameStr}); + llvm::GlobalVariable *SharedVar = OMPBuilder.getOrCreateInternalVariable( + LLVMType, ".omp.reduction." + SharedName); + + SharedVar->setAlignment( + llvm::MaybeAlign(CGF.getContext().getTypeAlign(PrivateType) / 8)); + + Address SharedResult = + CGF.MakeNaturalAlignRawAddrLValue(SharedVar, PrivateType).getAddress(); + + llvm::Value *ThreadId = getThreadID(CGF, Loc); + llvm::Value *BarrierLoc = emitUpdateLocation(CGF, Loc, OMP_ATOMIC_REDUCE); + llvm::Value *BarrierArgs[] = {BarrierLoc, ThreadId}; + + llvm::BasicBlock *InitBB = CGF.createBasicBlock("init"); + llvm::BasicBlock *InitEndBB = CGF.createBasicBlock("init.end"); + + llvm::Value *IsWorker = CGF.Builder.CreateICmpEQ( + ThreadId, llvm::ConstantInt::get(ThreadId->getType(), 0)); + CGF.Builder.CreateCondBr(IsWorker, InitBB, InitEndBB); + + CGF.EmitBlock(InitBB); + + auto EmitSharedInit = [&]() { + if (UDR) { // Check if it's a User-Defined Reduction + if (const Expr *UDRInitExpr = UDR->getInitializer()) { + std::pair FnPair = + getUserDefinedReduction(UDR); + llvm::Function *InitializerFn = FnPair.second; + if (InitializerFn) { + if (const auto *CE = + dyn_cast(UDRInitExpr->IgnoreParenImpCasts())) { + const auto *OutDRE = cast( + cast(CE->getArg(0)->IgnoreParenImpCasts()) + ->getSubExpr()); + const VarDecl *OutVD = cast(OutDRE->getDecl()); + + CodeGenFunction::OMPPrivateScope 
LocalScope(CGF); + LocalScope.addPrivate(OutVD, SharedResult); + + (void)LocalScope.Privatize(); + if (const auto *OVE = dyn_cast( + CE->getCallee()->IgnoreParenImpCasts())) { + CodeGenFunction::OpaqueValueMapping OpaqueMap( + CGF, OVE, RValue::get(InitializerFn)); + CGF.EmitIgnoredExpr(CE); + } else { + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), + /*IsInitializer=*/true); + } + } else { + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), + /*IsInitializer=*/true); + } + } else { + CGF.EmitAnyExprToMem(UDRInitExpr, SharedResult, + PrivateType.getQualifiers(), + /*IsInitializer=*/true); + } + } else { + // EmitNullInitialization handles default construction for C++ classes + // and zeroing for scalars, which is a reasonable default. + CGF.EmitNullInitialization(SharedResult, PrivateType); + } + return; // UDR initialization handled + } + if (const auto *DRE = dyn_cast(Privates)) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + if (const Expr *InitExpr = VD->getInit()) { + CGF.EmitAnyExprToMem(InitExpr, SharedResult, + PrivateType.getQualifiers(), true); + return; + } + } + } + CGF.EmitNullInitialization(SharedResult, PrivateType); + }; + EmitSharedInit(); + CGF.Builder.CreateBr(InitEndBB); + CGF.EmitBlock(InitEndBB); + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + const Expr *ReductionOp = ReductionOps; + const OMPDeclareReductionDecl *CurrentUDR = getReductionInit(ReductionOp); + LValue SharedLV = CGF.MakeAddrLValue(SharedResult, PrivateType); + LValue LHSLV = CGF.EmitLValue(Privates); + + auto EmitCriticalReduction = [&](auto ReductionGen) { + std::string CriticalName = getName({"reduction_critical"}); + emitCriticalRegion(CGF, CriticalName, ReductionGen, Loc); + }; + + if (CurrentUDR) { + // Handle user-defined reduction. 
+ auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + std::pair FnPair = + getUserDefinedReduction(CurrentUDR); + if (FnPair.first) { + if (const auto *CE = dyn_cast(ReductionOp)) { + const auto *OutDRE = cast( + cast(CE->getArg(0)->IgnoreParenImpCasts()) + ->getSubExpr()); + const auto *InDRE = cast( + cast(CE->getArg(1)->IgnoreParenImpCasts()) + ->getSubExpr()); + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(cast(OutDRE->getDecl()), + SharedLV.getAddress()); + LocalScope.addPrivate(cast(InDRE->getDecl()), + LHSLV.getAddress()); + (void)LocalScope.Privatize(); + emitReductionCombiner(CGF, ReductionOp); + } + } + }; + EmitCriticalReduction(ReductionGen); + } else { + // Handle built-in reduction operations. +#ifndef NDEBUG + const Expr *ReductionClauseExpr = ReductionOp->IgnoreParenCasts(); + if (const auto *Cleanup = dyn_cast(ReductionClauseExpr)) + ReductionClauseExpr = Cleanup->getSubExpr()->IgnoreParenCasts(); + + const Expr *AssignRHS = nullptr; + if (const auto *BinOp = dyn_cast(ReductionClauseExpr)) { + if (BinOp->getOpcode() == BO_Assign) + AssignRHS = BinOp->getRHS(); + } else if (const auto *OpCall = + dyn_cast(ReductionClauseExpr)) { + if (OpCall->getOperator() == OO_Equal) + AssignRHS = OpCall->getArg(1); + } + + assert(AssignRHS && + "Private Variable Reduction : Invalid ReductionOp expression"); +#endif + + auto ReductionGen = [&](CodeGenFunction &CGF, PrePostActionTy &Action) { + Action.Enter(CGF); + const auto *OmpOutDRE = + dyn_cast(LHSExprs->IgnoreParenImpCasts()); + const auto *OmpInDRE = + dyn_cast(RHSExprs->IgnoreParenImpCasts()); + assert( + OmpOutDRE && OmpInDRE && + "Private Variable Reduction : LHSExpr/RHSExpr must be DeclRefExprs"); + const VarDecl *OmpOutVD = cast(OmpOutDRE->getDecl()); + const VarDecl *OmpInVD = cast(OmpInDRE->getDecl()); + CodeGenFunction::OMPPrivateScope LocalScope(CGF); + LocalScope.addPrivate(OmpOutVD, SharedLV.getAddress()); + 
LocalScope.addPrivate(OmpInVD, LHSLV.getAddress()); + (void)LocalScope.Privatize(); + // Emit the actual reduction operation + CGF.EmitIgnoredExpr(ReductionOp); + }; + EmitCriticalReduction(ReductionGen); + } + + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + // Broadcast final result + bool IsAggregate = PrivateType->isAggregateType(); + LValue SharedLV1 = CGF.MakeAddrLValue(SharedResult, PrivateType); + llvm::Value *FinalResultVal = nullptr; + Address FinalResultAddr = Address::invalid(); + + if (IsAggregate) + FinalResultAddr = SharedResult; + else + FinalResultVal = CGF.EmitLoadOfScalar(SharedLV1, Loc); + + LValue TargetLHSLV = CGF.EmitLValue(RHSExprs); + if (IsAggregate) { + CGF.EmitAggregateCopy(TargetLHSLV, + CGF.MakeAddrLValue(FinalResultAddr, PrivateType), + PrivateType, AggValueSlot::DoesNotOverlap, false); + } else { + CGF.EmitStoreOfScalar(FinalResultVal, TargetLHSLV); + } + // Final synchronization barrier + CGF.EmitRuntimeCall(OMPBuilder.getOrCreateRuntimeFunction( + CGM.getModule(), OMPRTL___kmpc_barrier), + BarrierArgs); + + // Combiner with original list item + auto OriginalListCombiner = [&](CodeGenFunction &CGF, + PrePostActionTy &Action) { + Action.Enter(CGF); + emitSingleReductionCombiner(CGF, ReductionOps, Privates, + cast(LHSExprs), + cast(RHSExprs)); + }; + EmitCriticalReduction(OriginalListCombiner); +} + void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, - ArrayRef Privates, - ArrayRef LHSExprs, - ArrayRef RHSExprs, - ArrayRef ReductionOps, + ArrayRef OrgPrivates, + ArrayRef OrgLHSExprs, + ArrayRef OrgRHSExprs, + ArrayRef OrgReductionOps, ReductionOptionsTy Options) { if (!CGF.HaveInsertPoint()) return; @@ -4958,10 +5202,10 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, if (SimpleReduction) { CodeGenFunction::RunCleanupsScope Scope(CGF); - const auto *IPriv = Privates.begin(); - const auto *ILHS = 
LHSExprs.begin(); - const auto *IRHS = RHSExprs.begin(); - for (const Expr *E : ReductionOps) { + const auto *IPriv = OrgPrivates.begin(); + const auto *ILHS = OrgLHSExprs.begin(); + const auto *IRHS = OrgRHSExprs.begin(); + for (const Expr *E : OrgReductionOps) { emitSingleReductionCombiner(CGF, E, *IPriv, cast(*ILHS), cast(*IRHS)); ++IPriv; @@ -4971,6 +5215,26 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, return; } + // Filter out shared reduction variables based on IsPrivateVarReduction flag. + // Only keep entries where the corresponding variable is not private. + SmallVector FilteredPrivates, FilteredLHSExprs, + FilteredRHSExprs, FilteredReductionOps; + for (unsigned I : llvm::seq( + std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) { + if (!Options.IsPrivateVarReduction[I]) { + FilteredPrivates.emplace_back(OrgPrivates[I]); + FilteredLHSExprs.emplace_back(OrgLHSExprs[I]); + FilteredRHSExprs.emplace_back(OrgRHSExprs[I]); + FilteredReductionOps.emplace_back(OrgReductionOps[I]); + } + } + // Wrap filtered vectors in ArrayRef for downstream shared reduction + // processing. + ArrayRef Privates = FilteredPrivates; + ArrayRef LHSExprs = FilteredLHSExprs; + ArrayRef RHSExprs = FilteredRHSExprs; + ArrayRef ReductionOps = FilteredReductionOps; + // 1. Build a list of reduction variables. // void *RedList[] = {[0], ..., [-1]}; auto Size = RHSExprs.size(); @@ -5162,7 +5426,7 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, } else { // Emit as a critical region. 
auto &&CritRedGen = [E, Loc](CodeGenFunction &CGF, const Expr *, - const Expr *, const Expr *) { + const Expr *, const Expr *) { CGOpenMPRuntime &RT = CGF.CGM.getOpenMPRuntime(); std::string Name = RT.getName({"atomic_reduction"}); RT.emitCriticalRegion( @@ -5209,6 +5473,16 @@ void CGOpenMPRuntime::emitReduction(CodeGenFunction &CGF, SourceLocation Loc, CGF.EmitBranch(DefaultBB); CGF.EmitBlock(DefaultBB, /*IsFinished=*/true); + assert(OrgLHSExprs.size() == OrgPrivates.size() && + "PrivateVarReduction: Privates size mismatch"); + assert(OrgLHSExprs.size() == OrgReductionOps.size() && + "PrivateVarReduction: ReductionOps size mismatch"); + for (unsigned I : llvm::seq( + std::min(OrgReductionOps.size(), OrgLHSExprs.size()))) { + if (Options.IsPrivateVarReduction[I]) + emitPrivateReduction(CGF, Loc, OrgPrivates[I], OrgLHSExprs[I], + OrgRHSExprs[I], OrgReductionOps[I]); + } } /// Generates unique name for artificial threadprivate variables. diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 4321712e1521d..5be48b439f4fd 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1201,8 +1201,20 @@ class CGOpenMPRuntime { struct ReductionOptionsTy { bool WithNowait; bool SimpleReduction; + llvm::SmallVector IsPrivateVarReduction; OpenMPDirectiveKind ReductionKind; }; + + /// Emits code for private variable reduction + /// \param Privates List of private copies for original reduction arguments. + /// \param LHSExprs List of LHS in \a ReductionOps reduction operations. + /// \param RHSExprs List of RHS in \a ReductionOps reduction operations. + /// \param ReductionOps List of reduction operations in form 'LHS binop RHS' + /// or 'operator binop(LHS, RHS)'. + void emitPrivateReduction(CodeGenFunction &CGF, SourceLocation Loc, + const Expr *Privates, const Expr *LHSExprs, + const Expr *RHSExprs, const Expr *ReductionOps); + /// Emit a code for reduction clause. 
Next code should be emitted for /// reduction: /// \code diff --git a/clang/lib/CodeGen/CGStmtOpenMP.cpp b/clang/lib/CodeGen/CGStmtOpenMP.cpp index 7fa6bfa75c350..d9195d749e056 100644 --- a/clang/lib/CodeGen/CGStmtOpenMP.cpp +++ b/clang/lib/CodeGen/CGStmtOpenMP.cpp @@ -1472,6 +1472,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( llvm::SmallVector LHSExprs; llvm::SmallVector RHSExprs; llvm::SmallVector ReductionOps; + llvm::SmallVector IsPrivateVarReduction; bool HasAtLeastOneReduction = false; bool IsReductionWithTaskMod = false; for (const auto *C : D.getClausesOfKind()) { @@ -1482,6 +1483,8 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( Privates.append(C->privates().begin(), C->privates().end()); LHSExprs.append(C->lhs_exprs().begin(), C->lhs_exprs().end()); RHSExprs.append(C->rhs_exprs().begin(), C->rhs_exprs().end()); + IsPrivateVarReduction.append(C->private_var_reduction_flags().begin(), + C->private_var_reduction_flags().end()); ReductionOps.append(C->reduction_ops().begin(), C->reduction_ops().end()); IsReductionWithTaskMod = IsReductionWithTaskMod || C->getModifier() == OMPC_REDUCTION_task; @@ -1503,7 +1506,7 @@ void CodeGenFunction::EmitOMPReductionClauseFinal( // parallel directive (it always has implicit barrier). 
CGM.getOpenMPRuntime().emitReduction( *this, D.getEndLoc(), Privates, LHSExprs, RHSExprs, ReductionOps, - {WithNowait, SimpleReduction, ReductionKind}); + {WithNowait, SimpleReduction, IsPrivateVarReduction, ReductionKind}); } } @@ -3944,7 +3947,8 @@ static void emitScanBasedDirective( PrivScope.Privatize(); CGF.CGM.getOpenMPRuntime().emitReduction( CGF, S.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, - {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_unknown}); + {/*WithNowait=*/true, /*SimpleReduction=*/true, + /*IsPrivateVarReduction*/ {}, OMPD_unknown}); } llvm::Value *NextIVal = CGF.Builder.CreateNUWSub(IVal, llvm::ConstantInt::get(CGF.SizeTy, 1)); @@ -5749,7 +5753,8 @@ void CodeGenFunction::EmitOMPScanDirective(const OMPScanDirective &S) { } CGM.getOpenMPRuntime().emitReduction( *this, ParentDir.getEndLoc(), Privates, LHSs, RHSs, ReductionOps, - {/*WithNowait=*/true, /*SimpleReduction=*/true, OMPD_simd}); + {/*WithNowait=*/true, /*SimpleReduction=*/true, + /*IsPrivateVarReduction*/ {}, OMPD_simd}); for (unsigned I = 0, E = CopyArrayElems.size(); I < E; ++I) { const Expr *PrivateExpr = Privates[I]; LValue DestLVal; diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 4ac3a60ae455f..a3395ac157d96 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -19047,34 +19047,14 @@ static bool actOnOMPReductionKindClause( reportOriginalDsa(S, Stack, D, DVar); continue; } - // OpenMP 6.0 [ 7.6.10 ] - // Support Reduction over private variables with reduction clause. - // A list item in a reduction clause can now be private in the enclosing - // context. For orphaned constructs it is assumed to be shared unless the - // original(private) modifier appears in the clause. 
- DVar = Stack->getImplicitDSA(D, true); - bool IsOrphaned = false; - OpenMPDirectiveKind CurrDir = Stack->getCurrentDirective(); - OpenMPDirectiveKind ParentDir = Stack->getParentDirective(); - // Check if the construct is orphaned (has no enclosing OpenMP context) - IsOrphaned = ParentDir == OMPD_unknown; - // OpenMP 6.0: Private DSA check - IsPrivate = - (S.getLangOpts().OpenMP > 52) && - ((isOpenMPPrivate(DVar.CKind) && DVar.CKind != OMPC_reduction && - isOpenMPWorksharingDirective(CurrDir) && - !isOpenMPParallelDirective(CurrDir) && - !isOpenMPTeamsDirective(CurrDir) && - !isOpenMPSimdDirective(ParentDir)) || - (IsOrphaned && DVar.CKind == OMPC_unknown) || - RD.OrigSharingModifier != OMPC_ORIGINAL_SHARING_shared); // OpenMP [2.14.3.6, Restrictions, p.1] // A list item that appears in a reduction clause of a worksharing // construct must be shared in the parallel regions to which any of the // worksharing regions arising from the worksharing construct bind. - if (!IsPrivate && isOpenMPWorksharingDirective(CurrDir) && + if (S.getLangOpts().OpenMP <= 52 && + isOpenMPWorksharingDirective(CurrDir) && !isOpenMPParallelDirective(CurrDir) && !isOpenMPTeamsDirective(CurrDir)) { DVar = Stack->getImplicitDSA(D, true); @@ -19085,6 +19065,23 @@ static bool actOnOMPReductionKindClause( reportOriginalDsa(S, Stack, D, DVar); continue; } + } else if (isOpenMPWorksharingDirective(CurrDir) && + !isOpenMPParallelDirective(CurrDir) && + !isOpenMPTeamsDirective(CurrDir)) { + // OpenMP 6.0 [ 7.6.10 ] + // Support Reduction over private variables with reduction clause. + // A list item in a reduction clause can now be private in the enclosing + // context. For orphaned constructs it is assumed to be shared unless + // the original(private) modifier appears in the clause. 
+ DVar = Stack->getImplicitDSA(D, true); + // Determine if the variable should be considered private + IsPrivate = DVar.CKind != OMPC_shared; + bool IsOrphaned = false; + OpenMPDirectiveKind ParentDir = Stack->getParentDirective(); + IsOrphaned = ParentDir == OMPD_unknown; + if ((IsOrphaned && + RD.OrigSharingModifier == OMPC_ORIGINAL_SHARING_private)) + IsPrivate = true; } } else { // Threadprivates cannot be shared between threads, so dignose if the base diff --git a/clang/test/OpenMP/distribute_simd_misc_messages.c b/clang/test/OpenMP/distribute_simd_misc_messages.c index 8cbf96cd7a014..270e17dcb89bb 100644 --- a/clang/test/OpenMP/distribute_simd_misc_messages.c +++ b/clang/test/OpenMP/distribute_simd_misc_messages.c @@ -508,6 +508,7 @@ void test_collapse(void) { #pragma omp distribute simd collapse(5 - 5) for (i = 0; i < 16; ++i) ; +#if defined(_OPENMP) && (_OPENMP <= 202111) // expected-note@+3 2 {{defined as reduction}} #pragma omp target #pragma omp teams @@ -520,7 +521,7 @@ void test_collapse(void) { #pragma omp for reduction(+ : i, j) for (int k = 0; k < 16; ++k) i += j; - +#endif #pragma omp target #pragma omp teams for (i = 0; i < 16; ++i) diff --git a/clang/test/OpenMP/for_private_reduction_codegen.cpp b/clang/test/OpenMP/for_private_reduction_codegen.cpp new file mode 100644 index 0000000000000..c8a6863299fb3 --- /dev/null +++ b/clang/test/OpenMP/for_private_reduction_codegen.cpp @@ -0,0 +1,710 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex ".omp.reduction..internal[a-zA-Z_0-9.]+" +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -fopenmp -fopenmp-version=60 -x c++ -std=c++17 -emit-llvm %s -o - | FileCheck %s +// expected-no-diagnostics +#define N 10 +class Sum { + int val; + +public: + Sum(int v = 0) : val(v) {} + Sum operator+(const Sum &rhs) const { 
return Sum(val + rhs.val); } + Sum &operator+=(const Sum &rhs) { + val += rhs.val; + return *this; + } +}; +#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \ + initializer(omp_priv = Sum(0)) + +void func_red() { + Sum result(0); + Sum array[N]; + + for (int i = 0; i < N; i++) { + array[i] = Sum(i); + } + +#pragma omp parallel private(result) num_threads(4) + { +#pragma omp for reduction(sum_reduction : result) + for (int i = 0; i < N; i++) { + result = result + array[i]; + } + } +} + +void do_red(int n, int *v, int &sum_v) { + sum_v = 0; +#pragma omp for reduction(original(private), + : sum_v) + for (int i = 0; i < n; i++) { + sum_v += v[i]; + } +} +void do_red_extended(int n, int *v, int &sum_v, int &prod_v) { + sum_v = 0; + prod_v = 1; + +#pragma omp for reduction(original(private), + : sum_v) \ + reduction(original(private), * : prod_v) + for (int i = 0; i < n; i++) { + sum_v += v[i]; + prod_v *= v[i]; + } +} +int main(void) { + int v[N]; + for (int i = 0; i < N; i++) + v[i] = i; +#pragma omp parallel num_threads(4) + { + int s_v; + do_red(N, v, s_v); + } + + int sum_v_ext = 0, prod_v_ext = 1; +#pragma omp parallel num_threads(4) + { + do_red_extended(N, v, sum_v_ext, prod_v_ext); + } + return 0; +} + +//. +// CHECK: @.omp.reduction..internal_pivate_.result.result_996 = common global %class.Sum zeroinitializer, align 4 +// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1188 = common global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.sum_v.sum_v_1392 = common global i32 0, align 4 +// CHECK: @.omp.reduction..internal_pivate_.prod_v.prod_v_1461 = common global i32 0, align 4 +//. 
+// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[ARRAY:%.*]] = alloca [10 x %class.Sum], align 16 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3:[0-9]+]]) +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0) +// CHECK-NEXT: [[ARRAY_BEGIN:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAYCTOR_END:%.*]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAY_BEGIN]], i64 10 +// CHECK-NEXT: br label [[ARRAYCTOR_LOOP:%.*]] +// CHECK: arrayctor.loop: +// CHECK-NEXT: [[ARRAYCTOR_CUR:%.*]] = phi ptr [ [[ARRAY_BEGIN]], [[ENTRY:%.*]] ], [ [[ARRAYCTOR_NEXT:%.*]], [[ARRAYCTOR_LOOP]] ] +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYCTOR_CUR]], i32 noundef 0) +// CHECK-NEXT: [[ARRAYCTOR_NEXT]] = getelementptr inbounds [[CLASS_SUM]], ptr [[ARRAYCTOR_CUR]], i64 1 +// CHECK-NEXT: [[ARRAYCTOR_DONE:%.*]] = icmp eq ptr [[ARRAYCTOR_NEXT]], [[ARRAYCTOR_END]] +// CHECK-NEXT: br i1 [[ARRAYCTOR_DONE]], label [[ARRAYCTOR_CONT:%.*]], label [[ARRAYCTOR_LOOP]] +// CHECK: arrayctor.cont: +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[REF_TMP]], i32 noundef [[TMP2]]) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: 
[[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[ARRAYIDX]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP3:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @_Z8func_redv.omp_outlined, ptr [[ARRAY]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC1Ei +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4 +// CHECK-NEXT: call void @_ZN3SumC2Ei(ptr noundef nonnull align 4 dereferenceable(4) [[THIS1]], i32 noundef [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined +// CHECK-SAME: (ptr noalias noundef [[DOTGLOBAL_TID_:%.*]], ptr noalias noundef [[DOTBOUND_TID_:%.*]], ptr noundef nonnull align 4 dereferenceable(40) [[ARRAY:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTGLOBAL_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTBOUND_TID__ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: 
[[ARRAY_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RESULT:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[RESULT1:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[CLASS_SUM]], align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8 +// CHECK-NEXT: store ptr [[DOTGLOBAL_TID_]], ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[DOTBOUND_TID_]], ptr [[DOTBOUND_TID__ADDR]], align 8 +// CHECK-NEXT: store ptr [[ARRAY]], ptr [[ARRAY_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAY_ADDR]], align 8 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT]], i32 noundef 0) +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 9, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: call void @.omp_initializer.(ptr noundef [[RESULT1]], ptr noundef [[RESULT]]) +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[DOTGLOBAL_TID__ADDR]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 4 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1:[0-9]+]], i32 [[TMP2]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp sgt i32 [[TMP3]], 9 +// CHECK-NEXT: br i1 [[CMP]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: 
cond.true: +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 9, [[COND_TRUE]] ], [ [[TMP4]], [[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP2:%.*]] = icmp sle i32 [[TMP6]], [[TMP7]] +// CHECK-NEXT: br i1 [[CMP2]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP8]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP9]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %class.Sum], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: [[CALL:%.*]] = call i32 @_ZNK3SumplERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[RESULT1]], ptr noundef nonnull align 4 dereferenceable(4) [[ARRAYIDX]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[REF_TMP]], i32 0, i32 0 +// CHECK-NEXT: store i32 [[CALL]], ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[RESULT1]], ptr align 4 [[REF_TMP]], i64 4, i1 false) +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: +// 
CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[TMP10]], 1 +// CHECK-NEXT: store i32 [[ADD3]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP11:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2:[0-9]+]], i32 [[TMP2]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z8func_redv.omp_outlined.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP11]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK-NEXT: ] +// CHECK: .omp.reduction.case1: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.case2: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.default: +// CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[TMP2]], 0 +// CHECK-NEXT: br i1 [[TMP12]], label [[INIT:%.*]], label [[INIT_END:%.*]] +// CHECK: init: +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) @.omp.reduction..internal_pivate_.result.result_996, i32 noundef 0) +// CHECK-NEXT: br label [[INIT_END]] +// CHECK: init.end: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef @.omp.reduction..internal_pivate_.result.result_996, ptr noundef 
[[RESULT1]]) +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: [[TMP13:%.*]] = load [[CLASS_SUM]], ptr @.omp.reduction..internal_pivate_.result.result_996, align 4 +// CHECK-NEXT: store [[CLASS_SUM]] [[TMP13]], ptr [[RESULT1]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP2]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @.omp_combiner.(ptr noundef [[RESULT]], ptr noundef [[RESULT1]]) +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP2]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4:[0-9]+]], i32 [[TMP2]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_combiner. +// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[CALL:%.*]] = call noundef nonnull align 4 dereferenceable(4) ptr @_ZN3SumpLERKS_(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], ptr noundef nonnull align 4 dereferenceable(4) [[TMP2]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN3SumpLERKS_ +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, 
align 8 +// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr [[RHS]], ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[TMP0]], i32 0, i32 0 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[VAL]], align 4 +// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP2]], [[TMP1]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[VAL2]], align 4 +// CHECK-NEXT: ret ptr [[THIS1]] +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_initializer. +// CHECK-SAME: (ptr noalias noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[TMP3]], i32 noundef 0) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZNK3SumplERKS_ +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[RHS:%.*]]) #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca [[CLASS_SUM:%.*]], align 4 +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[RHS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store ptr 
[[RHS]], ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[VAL]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[RHS_ADDR]], align 8 +// CHECK-NEXT: [[VAL2:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[TMP1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[VAL2]], align 4 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP0]], [[TMP2]] +// CHECK-NEXT: call void @_ZN3SumC1Ei(ptr noundef nonnull align 4 dereferenceable(4) [[RETVAL]], i32 noundef [[ADD]]) +// CHECK-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[CLASS_SUM]], ptr [[RETVAL]], i32 0, i32 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[COERCE_DIVE]], align 4 +// CHECK-NEXT: ret i32 [[TMP3]] +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z8func_redv.omp_outlined.omp.reduction.reduction_func +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_ZN3SumC2Ei +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(4) [[THIS:%.*]], i32 noundef [[V:%.*]]) unnamed_addr #[[ATTR0]] comdat align 2 { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: store ptr [[THIS]], ptr [[THIS_ADDR]], align 8 +// CHECK-NEXT: store i32 [[V]], ptr [[V_ADDR]], align 4 +// CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], 
align 8 +// CHECK-NEXT: [[VAL:%.*]] = getelementptr inbounds nuw [[CLASS_SUM:%.*]], ptr [[THIS1]], i32 0, i32 0 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[V_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP0]], ptr [[VAL]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi +// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SUM_V4:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP5:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[I6:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP2]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr 
[[N_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP3]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP4]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[SUB3:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-NEXT: store i32 [[SUB3]], ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP5]] +// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK: omp.precond.then: +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: store i32 [[TMP6]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: store i32 0, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: store ptr [[SUM_V4]], ptr [[_TMP5]], align 8 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[TMP8]], [[TMP9]] +// CHECK-NEXT: br i1 [[CMP7]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_2]], align 4 +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label [[COND_END]] +// CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ 
[[TMP10]], [[COND_TRUE]] ], [ [[TMP11]], [[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP12]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP8:%.*]] = icmp sle i32 [[TMP13]], [[TMP14]] +// CHECK-NEXT: br i1 [[CMP8]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP15]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I6]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[I6]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP17]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[_TMP5]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +// CHECK-NEXT: [[ADD9:%.*]] = add nsw i32 [[TMP20]], [[TMP18]] +// CHECK-NEXT: store i32 [[ADD9]], ptr [[TMP19]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD10:%.*]] = add nsw i32 [[TMP21]], 1 +// CHECK-NEXT: store i32 [[ADD10]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: 
omp.loop.exit: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP22:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z6do_rediPiRi.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP22]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK-NEXT: ] +// CHECK: .omp.reduction.case1: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.case2: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.default: +// CHECK-NEXT: [[TMP23:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP23]], label [[INIT:%.*]], label [[INIT_END:%.*]] +// CHECK: init: +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 +// CHECK-NEXT: br label [[INIT_END]] +// CHECK: init.end: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP24:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[ADD11:%.*]] = add nsw i32 [[TMP24]], [[TMP25]] +// CHECK-NEXT: store i32 [[ADD11]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], 
i32 [[TMP0]]) +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1188, align 4 +// CHECK-NEXT: store i32 [[TMP26]], ptr [[SUM_V4]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP7]], align 4 +// CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[SUM_V4]], align 4 +// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP27]], [[TMP28]] +// CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP7]], align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: br label [[OMP_PRECOND_END]] +// CHECK: omp.precond.end: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z6do_rediPiRi.omp.reduction.reduction_func +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_ +// CHECK-SAME: (i32 noundef [[N:%.*]], ptr noundef [[V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[SUM_V:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[PROD_V:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[N_ADDR:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[SUM_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: 
[[PROD_V_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[TMP:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[_TMP1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTOMP_IV:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP2:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTCAPTURE_EXPR_3:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_LB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_UB:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_STRIDE:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_IS_LAST:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SUM_V5:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP6:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[PROD_V7:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[_TMP8:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[I9:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[DOTOMP_REDUCTION_RED_LIST:%.*]] = alloca [0 x ptr], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK-NEXT: store i32 [[N]], ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store ptr [[V]], ptr [[V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[SUM_V]], ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[PROD_V]], ptr [[PROD_V_ADDR]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store i32 0, ptr [[TMP1]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8 +// CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[SUM_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[PROD_V_ADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP4]], ptr [[_TMP1]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[N_ADDR]], align 4 +// CHECK-NEXT: store i32 [[TMP5]], ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, 
ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 [[TMP6]], 0 +// CHECK-NEXT: [[DIV:%.*]] = sdiv i32 [[SUB]], 1 +// CHECK-NEXT: [[SUB4:%.*]] = sub nsw i32 [[DIV]], 1 +// CHECK-NEXT: store i32 [[SUB4]], ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 0, [[TMP7]] +// CHECK-NEXT: br i1 [[CMP]], label [[OMP_PRECOND_THEN:%.*]], label [[OMP_PRECOND_END:%.*]] +// CHECK: omp.precond.then: +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: store i32 [[TMP8]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: store i32 1, ptr [[DOTOMP_STRIDE]], align 4 +// CHECK-NEXT: store i32 0, ptr [[DOTOMP_IS_LAST]], align 4 +// CHECK-NEXT: [[TMP9:%.*]] = load ptr, ptr [[TMP]], align 8 +// CHECK-NEXT: store i32 0, ptr [[SUM_V5]], align 4 +// CHECK-NEXT: store ptr [[SUM_V5]], ptr [[_TMP6]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = load ptr, ptr [[_TMP1]], align 8 +// CHECK-NEXT: store i32 1, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: store ptr [[PROD_V7]], ptr [[_TMP8]], align 8 +// CHECK-NEXT: call void @__kmpc_for_static_init_4(ptr @[[GLOB1]], i32 [[TMP0]], i32 34, ptr [[DOTOMP_IS_LAST]], ptr [[DOTOMP_LB]], ptr [[DOTOMP_UB]], ptr [[DOTOMP_STRIDE]], i32 1, i32 1) +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: [[CMP10:%.*]] = icmp sgt i32 [[TMP11]], [[TMP12]] +// CHECK-NEXT: br i1 [[CMP10]], label [[COND_TRUE:%.*]], label [[COND_FALSE:%.*]] +// CHECK: cond.true: +// CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[DOTCAPTURE_EXPR_3]], align 4 +// CHECK-NEXT: br label [[COND_END:%.*]] +// CHECK: cond.false: +// CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: br label [[COND_END]] +// 
CHECK: cond.end: +// CHECK-NEXT: [[COND:%.*]] = phi i32 [ [[TMP13]], [[COND_TRUE]] ], [ [[TMP14]], [[COND_FALSE]] ] +// CHECK-NEXT: store i32 [[COND]], ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[DOTOMP_LB]], align 4 +// CHECK-NEXT: store i32 [[TMP15]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND:%.*]] +// CHECK: omp.inner.for.cond: +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[DOTOMP_UB]], align 4 +// CHECK-NEXT: [[CMP11:%.*]] = icmp sle i32 [[TMP16]], [[TMP17]] +// CHECK-NEXT: br i1 [[CMP11]], label [[OMP_INNER_FOR_BODY:%.*]], label [[OMP_INNER_FOR_END:%.*]] +// CHECK: omp.inner.for.body: +// CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP18]], 1 +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 0, [[MUL]] +// CHECK-NEXT: store i32 [[ADD]], ptr [[I9]], align 4 +// CHECK-NEXT: [[TMP19:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP20:%.*]] = load i32, ptr [[I9]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP20]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 [[IDXPROM]] +// CHECK-NEXT: [[TMP21:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: [[TMP22:%.*]] = load ptr, ptr [[_TMP6]], align 8 +// CHECK-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4 +// CHECK-NEXT: [[ADD12:%.*]] = add nsw i32 [[TMP23]], [[TMP21]] +// CHECK-NEXT: store i32 [[ADD12]], ptr [[TMP22]], align 4 +// CHECK-NEXT: [[TMP24:%.*]] = load ptr, ptr [[V_ADDR]], align 8 +// CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[I9]], align 4 +// CHECK-NEXT: [[IDXPROM13:%.*]] = sext i32 [[TMP25]] to i64 +// CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[TMP24]], i64 [[IDXPROM13]] +// CHECK-NEXT: [[TMP26:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4 +// CHECK-NEXT: [[TMP27:%.*]] = load ptr, ptr [[_TMP8]], align 8 +// 
CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP27]], align 4 +// CHECK-NEXT: [[MUL15:%.*]] = mul nsw i32 [[TMP28]], [[TMP26]] +// CHECK-NEXT: store i32 [[MUL15]], ptr [[TMP27]], align 4 +// CHECK-NEXT: br label [[OMP_BODY_CONTINUE:%.*]] +// CHECK: omp.body.continue: +// CHECK-NEXT: br label [[OMP_INNER_FOR_INC:%.*]] +// CHECK: omp.inner.for.inc: +// CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: [[ADD16:%.*]] = add nsw i32 [[TMP29]], 1 +// CHECK-NEXT: store i32 [[ADD16]], ptr [[DOTOMP_IV]], align 4 +// CHECK-NEXT: br label [[OMP_INNER_FOR_COND]] +// CHECK: omp.inner.for.end: +// CHECK-NEXT: br label [[OMP_LOOP_EXIT:%.*]] +// CHECK: omp.loop.exit: +// CHECK-NEXT: call void @__kmpc_for_static_fini(ptr @[[GLOB1]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP30:%.*]] = call i32 @__kmpc_reduce(ptr @[[GLOB2]], i32 [[TMP0]], i32 0, i64 0, ptr [[DOTOMP_REDUCTION_RED_LIST]], ptr @_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func, ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: switch i32 [[TMP30]], label [[DOTOMP_REDUCTION_DEFAULT:%.*]] [ +// CHECK-NEXT: i32 1, label [[DOTOMP_REDUCTION_CASE1:%.*]] +// CHECK-NEXT: i32 2, label [[DOTOMP_REDUCTION_CASE2:%.*]] +// CHECK-NEXT: ] +// CHECK: .omp.reduction.case1: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.case2: +// CHECK-NEXT: call void @__kmpc_end_reduce(ptr @[[GLOB2]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction.var) +// CHECK-NEXT: br label [[DOTOMP_REDUCTION_DEFAULT]] +// CHECK: .omp.reduction.default: +// CHECK-NEXT: [[TMP31:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP31]], label [[INIT:%.*]], label [[INIT_END:%.*]] +// CHECK: init: +// CHECK-NEXT: store i32 0, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4 +// CHECK-NEXT: br label [[INIT_END]] +// CHECK: init.end: +// CHECK-NEXT: call void 
@__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP32:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4 +// CHECK-NEXT: [[TMP33:%.*]] = load i32, ptr [[SUM_V5]], align 4 +// CHECK-NEXT: [[ADD17:%.*]] = add nsw i32 [[TMP32]], [[TMP33]] +// CHECK-NEXT: store i32 [[ADD17]], ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP34:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.sum_v.sum_v_1392, align 4 +// CHECK-NEXT: store i32 [[TMP34]], ptr [[SUM_V5]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP35:%.*]] = load i32, ptr [[TMP9]], align 4 +// CHECK-NEXT: [[TMP36:%.*]] = load i32, ptr [[SUM_V5]], align 4 +// CHECK-NEXT: [[ADD18:%.*]] = add nsw i32 [[TMP35]], [[TMP36]] +// CHECK-NEXT: store i32 [[ADD18]], ptr [[TMP9]], align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP37:%.*]] = icmp eq i32 [[TMP0]], 0 +// CHECK-NEXT: br i1 [[TMP37]], label [[INIT19:%.*]], label [[INIT_END20:%.*]] +// CHECK: init19: +// CHECK-NEXT: store i32 1, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4 +// CHECK-NEXT: br label [[INIT_END20]] +// CHECK: init.end20: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP38:%.*]] = load i32, 
ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4 +// CHECK-NEXT: [[TMP39:%.*]] = load i32, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: [[MUL21:%.*]] = mul nsw i32 [[TMP38]], [[TMP39]] +// CHECK-NEXT: store i32 [[MUL21]], ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: [[TMP40:%.*]] = load i32, ptr @.omp.reduction..internal_pivate_.prod_v.prod_v_1461, align 4 +// CHECK-NEXT: store i32 [[TMP40]], ptr [[PROD_V7]], align 4 +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB2]], i32 [[TMP0]]) +// CHECK-NEXT: call void @__kmpc_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: [[TMP41:%.*]] = load i32, ptr [[TMP10]], align 4 +// CHECK-NEXT: [[TMP42:%.*]] = load i32, ptr [[PROD_V7]], align 4 +// CHECK-NEXT: [[MUL22:%.*]] = mul nsw i32 [[TMP41]], [[TMP42]] +// CHECK-NEXT: store i32 [[MUL22]], ptr [[TMP10]], align 4 +// CHECK-NEXT: call void @__kmpc_end_critical(ptr @[[GLOB3]], i32 [[TMP0]], ptr @.gomp_critical_user_.reduction_critical.var) +// CHECK-NEXT: br label [[OMP_PRECOND_END]] +// CHECK: omp.precond.end: +// CHECK-NEXT: call void @__kmpc_barrier(ptr @[[GLOB4]], i32 [[TMP0]]) +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@_Z15do_red_extendediPiRiS0_.omp.reduction.reduction_func +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]]) #[[ATTR5]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[DOTADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: [[DOTADDR1:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[TMP0]], ptr [[DOTADDR]], align 8 +// CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTADDR1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[DOTADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = load ptr, ptr [[DOTADDR1]], align 8 +// 
CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@main +// CHECK-SAME: () #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[V:%.*]] = alloca [10 x i32], align 16 +// CHECK-NEXT: [[I:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[SUM_V_EXT:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[PROD_V_EXT:%.*]] = alloca i32, align 4 +// CHECK-NEXT: [[TMP0:%.*]] = call i32 @__kmpc_global_thread_num(ptr @[[GLOB3]]) +// CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 +// CHECK-NEXT: store i32 0, ptr [[I]], align 4 +// CHECK-NEXT: br label [[FOR_COND:%.*]] +// CHECK: for.cond: +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[TMP1]], 10 +// CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_END:%.*]] +// CHECK: for.body: +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[TMP3]] to i64 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x i32], ptr [[V]], i64 0, i64 [[IDXPROM]] +// CHECK-NEXT: store i32 [[TMP2]], ptr [[ARRAYIDX]], align 4 +// CHECK-NEXT: br label [[FOR_INC:%.*]] +// CHECK: for.inc: +// CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[I]], align 4 +// CHECK-NEXT: [[INC:%.*]] = add nsw i32 [[TMP4]], 1 +// CHECK-NEXT: store i32 [[INC]], ptr [[I]], align 4 +// CHECK-NEXT: br label [[FOR_COND]], !llvm.loop [[LOOP7:![0-9]+]] +// CHECK: for.end: +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) @__kmpc_fork_call(ptr @[[GLOB3]], i32 1, ptr @main.omp_outlined, ptr [[V]]) +// CHECK-NEXT: store i32 0, ptr [[SUM_V_EXT]], align 4 +// CHECK-NEXT: store i32 1, ptr [[PROD_V_EXT]], align 4 +// CHECK-NEXT: call void @__kmpc_push_num_threads(ptr @[[GLOB3]], i32 [[TMP0]], i32 4) +// CHECK-NEXT: call void (ptr, i32, ptr, ...) 
@__kmpc_fork_call(ptr @[[GLOB3]], i32 3, ptr @main.omp_outlined.1, ptr [[V]], ptr [[SUM_V_EXT]], ptr [[PROD_V_EXT]]) +// CHECK-NEXT: ret i32 0 + diff --git a/clang/test/OpenMP/for_reduction_messages.cpp b/clang/test/OpenMP/for_reduction_messages.cpp index de28ba2c3be02..2fdac3048c9cd 100644 --- a/clang/test/OpenMP/for_reduction_messages.cpp +++ b/clang/test/OpenMP/for_reduction_messages.cpp @@ -417,10 +417,12 @@ int main(int argc, char **argv) { #pragma omp for reduction(+ : qa[1], qa[0]) for (int i = 0; i < 10; ++i) foo(); +#if defined(_OPENMP) && (_OPENMP <= 202111) #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}} #pragma omp for reduction(+ : fl) // expected-error {{reduction variable must be shared}} for (int i = 0; i < 10; ++i) foo(); +#endif static int m=0; #pragma omp for reduction(+:m) for (int i = 0; i < 10; ++i) diff --git a/clang/test/OpenMP/for_simd_reduction_messages.cpp b/clang/test/OpenMP/for_simd_reduction_messages.cpp index 96b3805b10a86..a9ef6c39cb5d2 100644 --- a/clang/test/OpenMP/for_simd_reduction_messages.cpp +++ b/clang/test/OpenMP/for_simd_reduction_messages.cpp @@ -396,11 +396,11 @@ int main(int argc, char **argv) { #pragma omp for simd reduction(+ : fl) // expected-error {{reduction variable must be shared}} for (int i = 0; i < 10; ++i) foo(); -#endif #pragma omp parallel reduction(* : fl) // expected-note {{defined as reduction}} #pragma omp for simd reduction(+ : fl) // expected-error {{reduction variable must be shared}} for (int i = 0; i < 10; ++i) foo(); +#endif static int m; #pragma omp for simd reduction(+ : m) for (int i = 0; i < 10; ++i) diff --git a/clang/test/OpenMP/sections_reduction_messages.cpp b/clang/test/OpenMP/sections_reduction_messages.cpp index 42ec3ed6d58e8..8cde6489f325f 100644 --- a/clang/test/OpenMP/sections_reduction_messages.cpp +++ b/clang/test/OpenMP/sections_reduction_messages.cpp @@ -461,12 +461,12 @@ int main(int argc, char **argv) { { foo(); } -#endif #pragma omp parallel 
reduction(* : fl) // expected-note {{defined as reduction}} #pragma omp sections reduction(+ : fl) // expected-error {{reduction variable must be shared}} { foo(); } +#endif static int m; #pragma omp sections reduction(+ : m) // OK { diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp new file mode 100644 index 0000000000000..9bf3be1e9e45d --- /dev/null +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -0,0 +1,194 @@ +// RUN: %libomp-cxx-compile -fopenmp-version=60 && %libomp-run +#include +#include +#include +#include +#include +#include "omp_testsuite.h" + +#define N 10 +class Sum { + int val; + +public: + Sum(int v = 0) : val(v) {} + Sum operator+(const Sum &rhs) const { return Sum(val + rhs.val); } + Sum &operator+=(const Sum &rhs) { + val += rhs.val; + return *this; + } + int getValue() const { return val; } +}; + +// Declare OpenMP reduction +#pragma omp declare reduction(sum_reduction:Sum : omp_out += omp_in) \ + initializer(omp_priv = Sum(0)) + +#pragma omp declare reduction(sum_pctor_reduction:Sum : omp_out += omp_in) \ + initializer(omp_priv = Sum(1)) // non-default ctor + +int checkUserDefinedReduction() { + Sum final_result_udr(0); + Sum final_result_udr_pctor(1); + Sum array_sum[N]; + int error_flag = 0; + int expected_value = 0; + int expected_value_pctor = 0; + for (int i = 0; i < N; ++i) { + array_sum[i] = Sum(i); + expected_value += i; // Calculate expected sum: 0 + 1 + ... 
+ (N-1) + expected_value_pctor += i; + } + int num_threads_for_pctor_calc = 4; // num_threads(4) + int priv_initializer_val_pctor = 1; // initializer(omp_priv = Sum(1)) + expected_value_pctor += + num_threads_for_pctor_calc + priv_initializer_val_pctor; +#pragma omp parallel num_threads(4) private(final_result_udr) private( \ + final_result_udr_pctor) + { +#pragma omp for reduction(sum_reduction : final_result_udr) \ + reduction(sum_pctor_reduction : final_result_udr_pctor) + for (int i = 0; i < N; ++i) { + final_result_udr += array_sum[i]; + final_result_udr_pctor += array_sum[i]; + } + + if (final_result_udr.getValue() != expected_value || + final_result_udr_pctor.getValue() != expected_value_pctor) + error_flag += 1; + } + return error_flag; +} +void performMinMaxRed(int &min_val, int &max_val) { + int input_data[] = {7, 3, 12, 5, 8}; + int n_size = sizeof(input_data) / sizeof(input_data[0]); + min_val = INT_MAX; + max_val = INT_MIN; +#pragma omp for reduction(original(private), min : min_val) \ + reduction(original(private), max : max_val) + for (int i = 0; i < n_size; ++i) { + if (input_data[i] < min_val) + min_val = input_data[i]; + if (input_data[i] > max_val) + max_val = input_data[i]; + } +} +int performComplexReduction() { + double _Complex arr[N]; + double _Complex expected = 0.0 + 0.0 * I; + double _Complex result = 0.0 + 0.0 * I; + int error = 0; + + // Initialize the array and compute serial sum + for (int i = 0; i < N; ++i) { + arr[i] = i - i * I; + expected += arr[i]; + } + double real_sum = 0.0, imag_sum = 0.0; +#pragma omp parallel private(real_sum) private(imag_sum) + { +#pragma omp for reduction(+ : real_sum, imag_sum) + for (int i = 0; i < N; ++i) { + real_sum += creal(arr[i]); + imag_sum += cimag(arr[i]); + } + + result = real_sum + imag_sum * I; + if (cabs(result - expected) > 1e-6) { + error++; + } + } + return error; +} + +std::complex doComplexReduction(std::complex *arr) { + std::complex result(1, 0); + +#pragma omp declare reduction(* : 
std::complex : omp_out *= omp_in) \ + initializer(omp_priv = std::complex(1, 0)) + +#pragma omp for reduction(original(private), * : result) + for (int i = 0; i < N; ++i) + result *= arr[i]; + + return result; +} + +void performReductions(int n_elements, const int *input_values, + int &sum_val_out, int &prod_val_out, + float &float_sum_val_out) { + // private variables for this thread's reduction. + sum_val_out = 0; + prod_val_out = 1; + float_sum_val_out = 0.0f; + + const float kPiValue = 3.14f; +#pragma omp for reduction(original(private), + : sum_val_out) \ + reduction(original(private), * : prod_val_out) \ + reduction(original(private), + : float_sum_val_out) + for (int i = 0; i < n_elements; ++i) { + sum_val_out += input_values[i]; + prod_val_out *= (i + 1); + float_sum_val_out += kPiValue; + } +} +int main(void) { + int input_array[N]; + int total_errors = 0; + const float kPiVal = 3.14f; + const int kExpectedSum = 45; // Sum of 0..9 + const int kExpectedProd = 3628800; // 10! + const float kExpectedFsum = kPiVal * N; // 3.14f * 10 + const int kExpectedMin = 3; + const int kExpectedMax = 12; + std::complex arr[N]; + std::complex kExpectedComplex(1, 0); + // Initialize the array + for (int i = 1; i <= N; ++i) { + arr[i - 1] = std::complex( + 1.0 + 0.1 * i, 0.5 * i); // Avoid zero to prevent multiplication by zero + kExpectedComplex *= arr[i - 1]; + } + + for (int i = 0; i < N; i++) + input_array[i] = i; +#pragma omp parallel num_threads(4) + { + + int t_sum_v; + int t_prod_v; + float t_fsum_v; + performReductions(N, input_array, t_sum_v, t_prod_v, t_fsum_v); + if (t_sum_v != kExpectedSum) + total_errors++; + if (t_prod_v != kExpectedProd) + total_errors++; + if (t_fsum_v != kExpectedFsum) + total_errors++; + } +#pragma omp parallel num_threads(4) + { + int t_min_v; + int t_max_v; + performMinMaxRed(t_min_v, t_max_v); + if (t_min_v != kExpectedMin) + total_errors++; + if (t_max_v != kExpectedMax) + total_errors++; + } + total_errors += 
checkUserDefinedReduction(); + total_errors += performComplexReduction(); +#pragma omp parallel num_threads(4) + { + std::complex result(1, 0); + result = doComplexReduction(arr); + if (std::abs(result.real() - kExpectedComplex.real()) > 1e-6 || + std::abs(result.imag() - kExpectedComplex.imag()) > 1e-6) { + total_errors++; + } + } + if (total_errors != 0) + fprintf(stderr, "ERROR: reduction on private variable %d\n", total_errors); + + return total_errors; +} From e44a65ed98ad896d0c0c3b1e10937a19f786b9ef Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 11 Jun 2025 10:36:12 +0200 Subject: [PATCH 045/851] [flang][OpenMP] Map basic `local` specifiers to `private` clauses (#142735) Starts the effort to map `do concurrent` locality specifiers to OpenMP clauses. This PR adds support for basic specifiers (no `init` or `copy` regions yet). --- .../OpenMP/DoConcurrentConversion.cpp | 55 ++++++++++++++++++- .../locality_specifiers_simple.mlir | 48 ++++++++++++++++ 2 files changed, 101 insertions(+), 2 deletions(-) create mode 100644 flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 0fdb302fe10ca..283c3052c166c 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -7,9 +7,11 @@ //===----------------------------------------------------------------------===// #include "flang/Optimizer/Builder/FIRBuilder.h" +#include "flang/Optimizer/Builder/Todo.h" #include "flang/Optimizer/Dialect/FIROps.h" #include "flang/Optimizer/OpenMP/Passes.h" #include "flang/Optimizer/OpenMP/Utils.h" +#include "flang/Support/OpenMP-utils.h" #include "mlir/Analysis/SliceAnalysis.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/IR/IRMapping.h" @@ -308,10 +310,47 @@ class DoConcurrentConversion fir::DoConcurrentLoopOp loop, mlir::IRMapping &mapper, const 
mlir::omp::LoopNestOperands &clauseOps, bool isComposite) const { + mlir::omp::WsloopOperands wsloopClauseOps; + + // For `local` (and `local_init`) operands, emit corresponding `private` + // clauses and attach these clauses to the workshare loop. + if (!loop.getLocalOperands().empty()) + for (auto [op, sym, arg] : llvm::zip_equal( + loop.getLocalOperands(), + loop.getLocalSymsAttr().getAsRange(), + loop.getRegionLocalArgs())) { + auto localizer = mlir::SymbolTable::lookupNearestSymbolFrom< + fir::LocalitySpecifierOp>(loop, sym); + if (localizer.getLocalitySpecifierType() == + fir::LocalitySpecifierType::LocalInit) + TODO(localizer.getLoc(), + "local_init conversion is not supported yet"); + + if (!localizer.getInitRegion().empty()) + TODO(localizer.getLoc(), + "non-empty `init` regions are not supported yet"); + + auto oldIP = rewriter.saveInsertionPoint(); + rewriter.setInsertionPointAfter(localizer); + auto privatizer = rewriter.create( + localizer.getLoc(), sym.getLeafReference().str() + ".omp", + localizer.getTypeAttr().getValue(), + mlir::omp::DataSharingClauseType::Private); + rewriter.restoreInsertionPoint(oldIP); + + wsloopClauseOps.privateVars.push_back(op); + wsloopClauseOps.privateSyms.push_back( + mlir::SymbolRefAttr::get(privatizer)); + } - auto wsloopOp = rewriter.create(loop.getLoc()); + auto wsloopOp = + rewriter.create(loop.getLoc(), wsloopClauseOps); wsloopOp.setComposite(isComposite); - rewriter.createBlock(&wsloopOp.getRegion()); + + Fortran::common::openmp::EntryBlockArgs wsloopArgs; + wsloopArgs.priv.vars = wsloopClauseOps.privateVars; + Fortran::common::openmp::genEntryBlock(rewriter, wsloopArgs, + wsloopOp.getRegion()); auto loopNestOp = rewriter.create(loop.getLoc(), clauseOps); @@ -324,6 +363,18 @@ class DoConcurrentConversion rewriter.setInsertionPointToEnd(&loopNestOp.getRegion().back()); rewriter.create(loop->getLoc()); + // `local` region arguments are transferred/cloned from the `do concurrent` + // loop to the loopnest op when the
region is cloned above. Instead, these + // region arguments should be on the workshare loop's region. + for (auto [wsloopArg, loopNestArg] : + llvm::zip_equal(wsloopOp.getRegion().getArguments(), + loopNestOp.getRegion().getArguments().drop_front( + clauseOps.loopLowerBounds.size()))) + rewriter.replaceAllUsesWith(loopNestArg, wsloopArg); + + for (unsigned i = 0; i < loop.getLocalVars().size(); ++i) + loopNestOp.getRegion().eraseArgument(clauseOps.loopLowerBounds.size()); + return loopNestOp; } diff --git a/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir new file mode 100644 index 0000000000000..160c1df040680 --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/locality_specifiers_simple.mlir @@ -0,0 +1,48 @@ +// Tests mapping `local` locality specifier to `private` clauses for a simple +// case (not `init` or `copy` regions). + +// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s + +fir.local {type = local} @_QFlocal_spec_translationElocal_var_private_f32 : f32 + +func.func @_QPlocal_spec_translation() { + %3 = fir.alloca f32 {bindc_name = "local_var", uniq_name = "_QFlocal_spec_translationElocal_var"} + %4:2 = hlfir.declare %3 {uniq_name = "_QFlocal_spec_translationElocal_var"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + %c4_i32 = arith.constant 4 : index + %c11_i32 = arith.constant 11 : index + %c1 = arith.constant 1 : index + + fir.do_concurrent { + %7 = fir.alloca i32 {bindc_name = "i"} + %8:2 = hlfir.declare %7 {uniq_name = "_QFlocal_spec_translationEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + + fir.do_concurrent.loop (%arg0) = (%c4_i32) to (%c11_i32) step (%c1) + local(@_QFlocal_spec_translationElocal_var_private_f32 %4#0 -> %arg1 : !fir.ref) { + %9 = fir.convert %arg0 : (index) -> i32 + fir.store %9 to %8#0 : !fir.ref + + %10:2 = hlfir.declare %arg1 {uniq_name = "_QFlocal_spec_translationElocal_var"} : (!fir.ref) -> (!fir.ref, !fir.ref) + %cst = 
arith.constant 4.200000e+01 : f32 + hlfir.assign %cst to %10#0 : f32, !fir.ref + } + } + return +} + +// CHECK: omp.private {type = private} @[[PRIVATIZER:.*local_spec_translationElocal_var.*.omp]] : f32 + +// CHECK: func.func @_QPlocal_spec_translation +// CHECK: %[[LOCAL_VAR:.*]] = fir.alloca f32 {bindc_name = "local_var", {{.*}}} +// CHECK: %[[LOCAL_VAR_DECL:.*]]:2 = hlfir.declare %[[LOCAL_VAR]] +// CHECK: omp.parallel { +// CHECK: omp.wsloop private(@[[PRIVATIZER]] %[[LOCAL_VAR_DECL]]#0 -> %[[LOCAL_ARG:.*]] : !fir.ref) { +// CHECK: omp.loop_nest {{.*}} { +// CHECK: %[[PRIV_DECL:.*]]:2 = hlfir.declare %[[LOCAL_ARG]] +// CHECK: %[[C42:.*]] = arith.constant +// CHECK: hlfir.assign %[[C42]] to %[[PRIV_DECL]]#0 +// CHECK: omp.yield +// CHECK: } +// CHECK: } +// CHECK: omp.terminator +// CHECK: } From 7460c700ae3026d927952f911d0e667de6e0c18b Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 11 Jun 2025 04:42:05 -0400 Subject: [PATCH 046/851] [MemCpyOpt] handle memcpy from memset in more cases (#140954) This aims to reduce the divergence between the initial checks in this function and processMemCpyMemCpyDependence (in particular, adding handling of offsets), with the goal to eventually reduce duplication there and improve this pass in other ways. 
--- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 74 ++++++++++++------- .../MemCpyOpt/memset-memcpy-oversized.ll | 47 ++++++++++++ .../MemCpyOpt/memset-memcpy-to-2x-memset.ll | 3 +- llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll | 2 +- .../MemCpyOpt/variable-sized-memset-memcpy.ll | 2 +- 5 files changed, 97 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index a78e3770384ae..960001bf880c6 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1364,8 +1364,9 @@ bool MemCpyOptPass::processMemSetMemCpyDependence(MemCpyInst *MemCpy, return true; } -/// Determine whether the instruction has undefined content for the given Size, -/// either because it was freshly alloca'd or started its lifetime. +/// Determine whether the pointer V had only undefined content (due to Def) up +/// to the given Size, either because it was freshly alloca'd or started its +/// lifetime. static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, MemoryDef *Def, Value *Size) { if (MSSA->isLiveOnEntryDef(Def)) @@ -1400,6 +1401,24 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, return false; } +// If the memcpy is larger than the previous, but the memory was undef prior to +// that, we can just ignore the tail. Technically we're only interested in the +// bytes from 0..MemSrcOffset and MemSrcLength+MemSrcOffset..CopySize here, but +// as we can't easily represent this location (hasUndefContents uses mustAlias +// which cannot deal with offsets), we use the full 0..CopySize range. 
+static bool overreadUndefContents(MemorySSA *MSSA, MemCpyInst *MemCpy, + MemIntrinsic *MemSrc, BatchAAResults &BAA) { + Value *CopySize = MemCpy->getLength(); + MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy); + MemoryUseOrDef *MemSrcAccess = MSSA->getMemoryAccess(MemSrc); + MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( + MemSrcAccess->getDefiningAccess(), MemCpyLoc, BAA); + if (auto *MD = dyn_cast(Clobber)) + if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize)) + return true; + return false; +} + /// Transform memcpy to memset when its source was just memset. /// In other words, turn: /// \code @@ -1415,19 +1434,25 @@ static bool hasUndefContents(MemorySSA *MSSA, BatchAAResults &AA, Value *V, bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, MemSetInst *MemSet, BatchAAResults &BAA) { - // Make sure that memcpy(..., memset(...), ...), that is we are memsetting and - // memcpying from the same address. Otherwise it is hard to reason about. - if (!BAA.isMustAlias(MemSet->getRawDest(), MemCpy->getRawSource())) - return false; - Value *MemSetSize = MemSet->getLength(); Value *CopySize = MemCpy->getLength(); - if (MemSetSize != CopySize) { - // Make sure the memcpy doesn't read any more than what the memset wrote. - // Don't worry about sizes larger than i64. + int64_t MOffset = 0; + const DataLayout &DL = MemCpy->getModule()->getDataLayout(); + // We can only transform memcpy's where the dest of one is the source of the + // other, or the memory transfer has a known offset from the memset. + if (MemCpy->getSource() != MemSet->getDest()) { + std::optional Offset = + MemCpy->getSource()->getPointerOffsetFrom(MemSet->getDest(), DL); + if (!Offset || *Offset < 0) + return false; + MOffset = *Offset; + } - // A known memset size is required. + if (MOffset != 0 || MemSetSize != CopySize) { + // Make sure the memcpy doesn't read any more than what the memset wrote, + // other than undef.
Don't worry about sizes larger than i64. A known memset + // size is required. auto *CMemSetSize = dyn_cast(MemSetSize); if (!CMemSetSize) return false; @@ -1436,23 +1461,18 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, auto *CCopySize = dyn_cast(CopySize); if (!CCopySize) return false; - if (CCopySize->getZExtValue() > CMemSetSize->getZExtValue()) { - // If the memcpy is larger than the memset, but the memory was undef prior - // to the memset, we can just ignore the tail. Technically we're only - // interested in the bytes from MemSetSize..CopySize here, but as we can't - // easily represent this location, we use the full 0..CopySize range. - MemoryLocation MemCpyLoc = MemoryLocation::getForSource(MemCpy); - bool CanReduceSize = false; - MemoryUseOrDef *MemSetAccess = MSSA->getMemoryAccess(MemSet); - MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( - MemSetAccess->getDefiningAccess(), MemCpyLoc, BAA); - if (auto *MD = dyn_cast(Clobber)) - if (hasUndefContents(MSSA, BAA, MemCpy->getSource(), MD, CopySize)) - CanReduceSize = true; - - if (!CanReduceSize) + if (CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) { + if (!overreadUndefContents(MSSA, MemCpy, MemSet, BAA)) return false; - CopySize = MemSetSize; + // Clip the memcpy to the bounds of the memset + if (MOffset == 0) + CopySize = MemSetSize; + else + CopySize = + ConstantInt::get(CopySize->getType(), + CMemSetSize->getZExtValue() <= (uint64_t)MOffset + ? 
0 + : CMemSetSize->getZExtValue() - MOffset); } } diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll index 1c3896407e950..0c16f34590fc7 100644 --- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-oversized.ll @@ -187,6 +187,53 @@ define void @test_write_before_memset_in_both_regions(ptr %result) { ret void } +define void @test_negative_offset_memset(ptr %result) { +; CHECK-LABEL: @test_negative_offset_memset( +; CHECK-NEXT: [[A1:%.*]] = alloca [16 x i8], align 8 +; CHECK-NEXT: [[A:%.*]] = getelementptr i8, ptr [[A1]], i32 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A1]], i64 12, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca [ 16 x i8 ], align 8 + %b = getelementptr i8, ptr %a, i32 4 + call void @llvm.memset.p0.i64(ptr align 8 %b, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 12, i1 false) + ret void +} + +define void @test_offset_memsetcpy(ptr %result) { +; CHECK-LABEL: @test_offset_memsetcpy( +; CHECK-NEXT: [[A1:%.*]] = alloca [16 x i8], align 8 +; CHECK-NEXT: [[A:%.*]] = getelementptr i8, ptr [[A1]], i32 4 +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A1]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[RESULT:%.*]], i8 0, i64 8, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca [ 16 x i8 ], align 8 + %b = getelementptr i8, ptr %a, i32 4 + call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %b, i64 12, i1 false) + ret void +} + +define void @test_two_memset(ptr %result) { +; CHECK-LABEL: @test_two_memset( +; CHECK-NEXT: [[A:%.*]] = alloca [16 x i8], align 8 +; CHECK-NEXT: [[B:%.*]] = getelementptr i8, ptr [[A]], i32 12 +; 
CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[A]], i8 0, i64 12, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[B]], i8 1, i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[RESULT:%.*]], ptr align 8 [[A]], i64 16, i1 false) +; CHECK-NEXT: ret void +; + %a = alloca [ 16 x i8 ], align 8 + %b = getelementptr i8, ptr %a, i32 12 + call void @llvm.memset.p0.i64(ptr align 8 %a, i8 0, i64 12, i1 false) + call void @llvm.memset.p0.i64(ptr align 8 %b, i8 1, i64 4, i1 false) + call void @llvm.memcpy.p0.p0.i64(ptr %result, ptr align 8 %a, i64 16, i1 false) + ret void +} + declare ptr @malloc(i64) declare void @free(ptr) diff --git a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll index 47474e8dac051..18488f03a2d88 100644 --- a/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll +++ b/llvm/test/Transforms/MemCpyOpt/memset-memcpy-to-2x-memset.ll @@ -73,11 +73,10 @@ define void @test_different_source_gep(ptr %dst1, ptr %dst2, i8 %c) { ; CHECK-LABEL: @test_different_source_gep( ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[DST1:%.*]], i8 [[C:%.*]], i64 128, i1 false) ; CHECK-NEXT: [[P:%.*]] = getelementptr i8, ptr [[DST1]], i64 64 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[DST2:%.*]], ptr [[P]], i64 64, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr [[DST2:%.*]], i8 [[C]], i64 64, i1 false) ; CHECK-NEXT: ret void ; call void @llvm.memset.p0.i64(ptr %dst1, i8 %c, i64 128, i1 false) - ; FIXME: We could optimize this as well. 
%p = getelementptr i8, ptr %dst1, i64 64 call void @llvm.memcpy.p0.p0.i64(ptr %dst2, ptr %p, i64 64, i1 false) ret void diff --git a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll index 5e13432746bf7..0e312bc42d463 100644 --- a/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll +++ b/llvm/test/Transforms/MemCpyOpt/mixed-sizes.ll @@ -19,7 +19,7 @@ define i32 @foo(i1 %z) { ; CHECK: for.body3.lr.ph: ; CHECK-NEXT: br label [[FOR_INC7_1]] ; CHECK: for.inc7.1: -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 4 [[A]], ptr align 4 [[SCEVGEP]], i64 4, i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[A]], i8 0, i64 4, i1 false) ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[A]], align 4 ; CHECK-NEXT: ret i32 [[TMP2]] ; diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll index a834d2465dfa5..d5b1ab9b2f299 100644 --- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll @@ -18,7 +18,7 @@ define void @test(ptr %src, i8 %c, i64 %size) { ret void } -; Differing sizes, so left as it is. +; Differing sizes, but would be UB if size1 < size2 since the memcpy would reference outside of the first alloca define void @negative_test(ptr %src, i8 %c, i64 %size1, i64 %size2) { ; CHECK-LABEL: @negative_test( ; CHECK-NEXT: [[DST1:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1 From ddb771ecfd12cab8d323a4e64e64b965883585de Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 11 Jun 2025 09:50:26 +0100 Subject: [PATCH 047/851] [AArch64][Clang] Update new Neon vector element types. (#142760) This updates the element types used in the new __Int8x8_t types added in #126945, mostly to allow C++ name mangling in ItaniumMangling mangleAArch64VectorBase to work correctly. 
Char is replaced by SignedCharTy or UnsignedCharTy as required and Float16Ty is better using HalfTy to match the vector types. Same for Long types. --- .../include/clang/Basic/AArch64ACLETypes.def | 22 +- clang/test/AST/ast-dump-aarch64-neon-types.c | 22 +- clang/test/CodeGen/AArch64/mixed-neon-types.c | 559 ++++++++++++++++-- 3 files changed, 538 insertions(+), 65 deletions(-) diff --git a/clang/include/clang/Basic/AArch64ACLETypes.def b/clang/include/clang/Basic/AArch64ACLETypes.def index 9acfd693288cf..bbe0c85f9ffbe 100644 --- a/clang/include/clang/Basic/AArch64ACLETypes.def +++ b/clang/include/clang/Basic/AArch64ACLETypes.def @@ -123,31 +123,31 @@ //===- Neon Vector point types --------------------------------------------===// -NEON_VECTOR_TYPE(__Int8x8_t, CharTy, 8, 8, VectorKind::Neon) +NEON_VECTOR_TYPE(__Int8x8_t, SignedCharTy, 8, 8, VectorKind::Neon) NEON_VECTOR_TYPE(__Int16x4_t, ShortTy, 16, 4, VectorKind::Neon) NEON_VECTOR_TYPE(__Int32x2_t, IntTy, 32, 2, VectorKind::Neon) -NEON_VECTOR_TYPE(__Uint8x8_t, CharTy, 8, 8, VectorKind::Neon) +NEON_VECTOR_TYPE(__Uint8x8_t, UnsignedCharTy, 8, 8, VectorKind::Neon) NEON_VECTOR_TYPE(__Uint16x4_t, UnsignedShortTy, 16, 4, VectorKind::Neon) NEON_VECTOR_TYPE(__Uint32x2_t, UnsignedIntTy, 32, 2, VectorKind::Neon) -NEON_VECTOR_TYPE(__Float16x4_t, Float16Ty, 16, 4, VectorKind::Neon) +NEON_VECTOR_TYPE(__Float16x4_t, HalfTy, 16, 4, VectorKind::Neon) NEON_VECTOR_TYPE(__Float32x2_t, FloatTy, 32, 2, VectorKind::Neon) -NEON_VECTOR_TYPE(__Poly8x8_t, CharTy, 8, 8, VectorKind::NeonPoly) +NEON_VECTOR_TYPE(__Poly8x8_t, UnsignedCharTy, 8, 8, VectorKind::NeonPoly) NEON_VECTOR_TYPE(__Poly16x4_t, UnsignedShortTy, 16, 4, VectorKind::NeonPoly) NEON_VECTOR_TYPE(__Bfloat16x4_t, BFloat16Ty, 16, 4, VectorKind::Neon) -NEON_VECTOR_TYPE(__Int8x16_t, CharTy, 8, 16, VectorKind::Neon) +NEON_VECTOR_TYPE(__Int8x16_t, SignedCharTy, 8, 16, VectorKind::Neon) NEON_VECTOR_TYPE(__Int16x8_t, ShortTy, 16, 8, VectorKind::Neon) NEON_VECTOR_TYPE(__Int32x4_t, IntTy, 
32, 4, VectorKind::Neon) -NEON_VECTOR_TYPE(__Int64x2_t, LongLongTy, 64, 2, VectorKind::Neon) -NEON_VECTOR_TYPE(__Uint8x16_t, CharTy, 8, 16, VectorKind::Neon) +NEON_VECTOR_TYPE(__Int64x2_t, LongTy, 64, 2, VectorKind::Neon) +NEON_VECTOR_TYPE(__Uint8x16_t, UnsignedCharTy, 8, 16, VectorKind::Neon) NEON_VECTOR_TYPE(__Uint16x8_t, UnsignedShortTy, 16, 8, VectorKind::Neon) NEON_VECTOR_TYPE(__Uint32x4_t, UnsignedIntTy, 32, 4, VectorKind::Neon) -NEON_VECTOR_TYPE(__Uint64x2_t, UnsignedLongLongTy, 64, 2, VectorKind::Neon) -NEON_VECTOR_TYPE(__Float16x8_t, Float16Ty, 16, 8, VectorKind::Neon) +NEON_VECTOR_TYPE(__Uint64x2_t, UnsignedLongTy, 64, 2, VectorKind::Neon) +NEON_VECTOR_TYPE(__Float16x8_t, HalfTy, 16, 8, VectorKind::Neon) NEON_VECTOR_TYPE(__Float32x4_t, FloatTy, 32, 4, VectorKind::Neon) NEON_VECTOR_TYPE(__Float64x2_t, DoubleTy, 64, 2, VectorKind::Neon) -NEON_VECTOR_TYPE(__Poly8x16_t, CharTy, 8, 16, VectorKind::NeonPoly) +NEON_VECTOR_TYPE(__Poly8x16_t, UnsignedCharTy, 8, 16, VectorKind::NeonPoly) NEON_VECTOR_TYPE(__Poly16x8_t, UnsignedShortTy, 16, 8, VectorKind::NeonPoly) -NEON_VECTOR_TYPE(__Poly64x2_t, UnsignedLongLongTy, 64, 2, VectorKind::NeonPoly) +NEON_VECTOR_TYPE(__Poly64x2_t, UnsignedLongTy, 64, 2, VectorKind::NeonPoly) NEON_VECTOR_TYPE(__Bfloat16x8_t, BFloat16Ty, 16, 8, VectorKind::Neon) NEON_VECTOR_TYPE(__Mfloat8x8_t, MFloat8Ty, 8, 8, VectorKind::Neon) NEON_VECTOR_TYPE(__Mfloat8x16_t, MFloat8Ty, 8, 16, VectorKind::Neon) diff --git a/clang/test/AST/ast-dump-aarch64-neon-types.c b/clang/test/AST/ast-dump-aarch64-neon-types.c index 16255cd51c9d8..f509bd880c14b 100644 --- a/clang/test/AST/ast-dump-aarch64-neon-types.c +++ b/clang/test/AST/ast-dump-aarch64-neon-types.c @@ -9,7 +9,7 @@ // RUN: %clang_cc1 -verify -verify-ignore-unexpected=note -triple arm-linux-gnu %s -x c++ __Int8x8_t Int8x8; -// CHECK: Int8x8 '__Int8x8_t':'__attribute__((neon_vector_type(8))) char' +// CHECK: Int8x8 '__Int8x8_t':'__attribute__((neon_vector_type(8))) signed char' // 
expected-error@-2{{unknown type name '__Int8x8_t'}} __Int16x4_t Int16x4; @@ -21,7 +21,7 @@ __Int32x2_t Int32x2; // expected-error@-2{{unknown type name '__Int32x2_t'}} __Uint8x8_t Uint8x8; -// CHECK: Uint8x8 '__Uint8x8_t':'__attribute__((neon_vector_type(8))) char' +// CHECK: Uint8x8 '__Uint8x8_t':'__attribute__((neon_vector_type(8))) unsigned char' // expected-error@-2{{unknown type name '__Uint8x8_t'}} __Uint16x4_t Uint16x4; @@ -33,7 +33,7 @@ __Uint32x2_t Uint32x2; // expected-error@-2{{unknown type name '__Uint32x2_t'}} __Float16x4_t Float16x4; -// CHECK: Float16x4 '__Float16x4_t':'__attribute__((neon_vector_type(4))) _Float16' +// CHECK: Float16x4 '__Float16x4_t':'__attribute__((neon_vector_type(4))) __fp16' // expected-error@-2{{unknown type name '__Float16x4_t'}} __Float32x2_t Float32x2; @@ -41,7 +41,7 @@ __Float32x2_t Float32x2; // expected-error@-2{{unknown type name '__Float32x2_t'}} __Poly8x8_t Poly8x8; -// CHECK: Poly8x8 '__Poly8x8_t':'__attribute__((neon_polyvector_type(8))) char' +// CHECK: Poly8x8 '__Poly8x8_t':'__attribute__((neon_polyvector_type(8))) unsigned char' // expected-error@-2{{unknown type name '__Poly8x8_t'}} __Poly16x4_t Poly16x4; @@ -53,7 +53,7 @@ __Bfloat16x4_t Bfloat16x4; // expected-error@-2{{unknown type name '__Bfloat16x4_t'}} __Int8x16_t Int8x16; -// CHECK: Int8x16 '__Int8x16_t':'__attribute__((neon_vector_type(16))) char' +// CHECK: Int8x16 '__Int8x16_t':'__attribute__((neon_vector_type(16))) signed char' // expected-error@-2{{unknown type name '__Int8x16_t'}} __Int16x8_t Int16x8; @@ -65,11 +65,11 @@ __Int32x4_t Int32x4; // expected-error@-2{{unknown type name '__Int32x4_t'}} __Int64x2_t Int64x2; -// CHECK: Int64x2 '__Int64x2_t':'__attribute__((neon_vector_type(2))) long long' +// CHECK: Int64x2 '__Int64x2_t':'__attribute__((neon_vector_type(2))) long' // expected-error@-2{{unknown type name '__Int64x2_t'}} __Uint8x16_t Uint8x16; -// CHECK: Uint8x16 '__Uint8x16_t':'__attribute__((neon_vector_type(16))) char' +// CHECK: Uint8x16 
'__Uint8x16_t':'__attribute__((neon_vector_type(16))) unsigned char' // expected-error@-2{{unknown type name '__Uint8x16_t'}} __Uint16x8_t Uint16x8; @@ -81,11 +81,11 @@ __Uint32x4_t Uint32x4; // expected-error@-2{{unknown type name '__Uint32x4_t'}} __Uint64x2_t Uint64x2; -// CHECK: Uint64x2 '__Uint64x2_t':'__attribute__((neon_vector_type(2))) unsigned long long' +// CHECK: Uint64x2 '__Uint64x2_t':'__attribute__((neon_vector_type(2))) unsigned long' // expected-error@-2{{unknown type name '__Uint64x2_t'}} __Float16x8_t Float16x8; -// CHECK: Float16x8 '__Float16x8_t':'__attribute__((neon_vector_type(8))) _Float16' +// CHECK: Float16x8 '__Float16x8_t':'__attribute__((neon_vector_type(8))) __fp16' // expected-error@-2{{unknown type name '__Float16x8_t'}} __Float32x4_t Float32x4; @@ -97,7 +97,7 @@ __Float64x2_t Float64x2; // expected-error@-2{{unknown type name '__Float64x2_t'}} __Poly8x16_t Poly8x16; -// CHECK: Poly8x16 '__Poly8x16_t':'__attribute__((neon_polyvector_type(16))) char' +// CHECK: Poly8x16 '__Poly8x16_t':'__attribute__((neon_polyvector_type(16))) unsigned char' // expected-error@-2{{unknown type name '__Poly8x16_t'}} __Poly16x8_t Poly16x8; @@ -105,7 +105,7 @@ __Poly16x8_t Poly16x8; // expected-error@-2{{unknown type name '__Poly16x8_t'}} __Poly64x2_t Poly64x2; -// CHECK: Poly64x2 '__Poly64x2_t':'__attribute__((neon_polyvector_type(2))) unsigned long long' +// CHECK: Poly64x2 '__Poly64x2_t':'__attribute__((neon_polyvector_type(2))) unsigned long' // expected-error@-2{{unknown type name '__Poly64x2_t'}} __Bfloat16x8_t Bfloat16x8; diff --git a/clang/test/CodeGen/AArch64/mixed-neon-types.c b/clang/test/CodeGen/AArch64/mixed-neon-types.c index 47681a507d715..34fbe499f4052 100644 --- a/clang/test/CodeGen/AArch64/mixed-neon-types.c +++ b/clang/test/CodeGen/AArch64/mixed-neon-types.c @@ -3,23 +3,23 @@ // RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -x c++ %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK-CPP // REQUIRES: 
aarch64-registered-target -typedef __Uint32x4_t X; +typedef __Uint8x16_t X; -// CHECK-C-LABEL: define dso_local <4 x i32> @test( -// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-C-LABEL: define dso_local <16 x i8> @test( +// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-C-NEXT: [[ENTRY:.*:]] -// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x i32>, align 16 -// CHECK-C-NEXT: store <4 x i32> [[X]], ptr [[X_ADDR]], align 16 -// CHECK-C-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 -// CHECK-C-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-C-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <16 x i8> [[TMP0]] // -// CHECK-CPP-LABEL: define dso_local noundef <4 x i32> @_Z4test12__Uint32x4_t( -// CHECK-CPP-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z4test12__Uint8x16_t( +// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-CPP-NEXT: [[ENTRY:.*:]] -// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x i32>, align 16 -// CHECK-CPP-NEXT: store <4 x i32> [[X]], ptr [[X_ADDR]], align 16 -// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 -// CHECK-CPP-NEXT: ret <4 x i32> [[TMP0]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-CPP-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <16 x i8> [[TMP0]] // X test(X x) { return x; @@ -28,47 +28,520 @@ X test(X x) { #include // CHECK-C-LABEL: define dso_local <16 x i8> @testboth( -// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { // CHECK-C-NEXT: [[ENTRY:.*:]] // CHECK-C-NEXT: [[__P0_ADDR_I:%.*]] = alloca <16 x 
i8>, align 16 // CHECK-C-NEXT: [[__P1_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // CHECK-C-NEXT: [[__RET_I:%.*]] = alloca <16 x i8>, align 16 -// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x i32>, align 16 -// CHECK-C-NEXT: store <4 x i32> [[X]], ptr [[X_ADDR]], align 16 -// CHECK-C-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 -// CHECK-C-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-C-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 -// CHECK-C-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -// CHECK-C-NEXT: store <16 x i8> [[TMP1]], ptr [[__P0_ADDR_I]], align 16 -// CHECK-C-NEXT: store <16 x i8> [[TMP3]], ptr [[__P1_ADDR_I]], align 16 -// CHECK-C-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16 -// CHECK-C-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16 -// CHECK-C-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-C-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: store <16 x i8> [[TMP0]], ptr [[__P0_ADDR_I]], align 16 +// CHECK-C-NEXT: store <16 x i8> [[TMP1]], ptr [[__P1_ADDR_I]], align 16 +// CHECK-C-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16 +// CHECK-C-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16 +// CHECK-C-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[TMP2]], [[TMP3]] // CHECK-C-NEXT: store <16 x i8> [[ADD_I]], ptr [[__RET_I]], align 16 -// CHECK-C-NEXT: [[TMP6:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16 -// CHECK-C-NEXT: ret <16 x i8> [[TMP6]] +// CHECK-C-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16 +// CHECK-C-NEXT: ret <16 x i8> [[TMP4]] // -// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z8testboth12__Uint32x4_t( -// CHECK-CPP-SAME: <4 x 
i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z8testboth12__Uint8x16_t( +// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { // CHECK-CPP-NEXT: [[ENTRY:.*:]] // CHECK-CPP-NEXT: [[__P0_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // CHECK-CPP-NEXT: [[__P1_ADDR_I:%.*]] = alloca <16 x i8>, align 16 // CHECK-CPP-NEXT: [[__RET_I:%.*]] = alloca <16 x i8>, align 16 -// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x i32>, align 16 -// CHECK-CPP-NEXT: store <4 x i32> [[X]], ptr [[X_ADDR]], align 16 -// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 -// CHECK-CPP-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> -// CHECK-CPP-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 -// CHECK-CPP-NEXT: [[TMP3:%.*]] = bitcast <4 x i32> [[TMP2]] to <16 x i8> -// CHECK-CPP-NEXT: store <16 x i8> [[TMP1]], ptr [[__P0_ADDR_I]], align 16 -// CHECK-CPP-NEXT: store <16 x i8> [[TMP3]], ptr [[__P1_ADDR_I]], align 16 -// CHECK-CPP-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16 -// CHECK-CPP-NEXT: [[TMP5:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16 -// CHECK-CPP-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[TMP4]], [[TMP5]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-CPP-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: store <16 x i8> [[TMP0]], ptr [[__P0_ADDR_I]], align 16 +// CHECK-CPP-NEXT: store <16 x i8> [[TMP1]], ptr [[__P1_ADDR_I]], align 16 +// CHECK-CPP-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr [[__P0_ADDR_I]], align 16 +// CHECK-CPP-NEXT: [[TMP3:%.*]] = load <16 x i8>, ptr [[__P1_ADDR_I]], align 16 +// CHECK-CPP-NEXT: [[ADD_I:%.*]] = add <16 x i8> [[TMP2]], [[TMP3]] // CHECK-CPP-NEXT: store <16 x i8> [[ADD_I]], ptr [[__RET_I]], align 16 -// CHECK-CPP-NEXT: 
[[TMP6:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16 -// CHECK-CPP-NEXT: ret <16 x i8> [[TMP6]] +// CHECK-CPP-NEXT: [[TMP4:%.*]] = load <16 x i8>, ptr [[__RET_I]], align 16 +// CHECK-CPP-NEXT: ret <16 x i8> [[TMP4]] // -int8x16_t testboth(X x) { +uint8x16_t testboth(X x) { return vaddq_u8(x, x); } + +// CHECK-C-LABEL: define dso_local <8 x i8> @test__Int8x8_t( +// CHECK-C-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-C-NEXT: store <8 x i8> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <8 x i8> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <8 x i8> @_Z14test__Int8x8_t10__Int8x8_t( +// CHECK-CPP-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-CPP-NEXT: store <8 x i8> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <8 x i8> [[TMP0]] +// +int8x8_t test__Int8x8_t(__Int8x8_t x) { return x; } +// CHECK-C-LABEL: define dso_local <4 x i16> @test__Int16x4_t( +// CHECK-C-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-C-NEXT: store <4 x i16> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <4 x i16> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <4 x i16> @_Z15test__Int16x4_t11__Int16x4_t( +// CHECK-CPP-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-CPP-NEXT: store <4 x i16> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret 
<4 x i16> [[TMP0]] +// +int16x4_t test__Int16x4_t(__Int16x4_t x) { return x; } +// CHECK-C-LABEL: define dso_local <2 x i32> @test__Int32x2_t( +// CHECK-C-SAME: <2 x i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <2 x i32>, align 8 +// CHECK-C-NEXT: store <2 x i32> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <2 x i32> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <2 x i32> @_Z15test__Int32x2_t11__Int32x2_t( +// CHECK-CPP-SAME: <2 x i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <2 x i32>, align 8 +// CHECK-CPP-NEXT: store <2 x i32> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <2 x i32> [[TMP0]] +// +int32x2_t test__Int32x2_t(__Int32x2_t x) { return x; } +// CHECK-C-LABEL: define dso_local <8 x i8> @test__Uint8x8_t( +// CHECK-C-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-C-NEXT: store <8 x i8> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <8 x i8> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <8 x i8> @_Z15test__Uint8x8_t11__Uint8x8_t( +// CHECK-CPP-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-CPP-NEXT: store <8 x i8> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <8 x i8> [[TMP0]] +// +uint8x8_t test__Uint8x8_t(__Uint8x8_t x) { return x; } +// CHECK-C-LABEL: define dso_local <4 x i16> @test__Uint16x4_t( +// CHECK-C-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: 
[[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-C-NEXT: store <4 x i16> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <4 x i16> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <4 x i16> @_Z16test__Uint16x4_t12__Uint16x4_t( +// CHECK-CPP-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-CPP-NEXT: store <4 x i16> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <4 x i16> [[TMP0]] +// +uint16x4_t test__Uint16x4_t(__Uint16x4_t x) { return x; } +// CHECK-C-LABEL: define dso_local <2 x i32> @test__Uint32x2_t( +// CHECK-C-SAME: <2 x i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <2 x i32>, align 8 +// CHECK-C-NEXT: store <2 x i32> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <2 x i32> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <2 x i32> @_Z16test__Uint32x2_t12__Uint32x2_t( +// CHECK-CPP-SAME: <2 x i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <2 x i32>, align 8 +// CHECK-CPP-NEXT: store <2 x i32> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <2 x i32> [[TMP0]] +// +uint32x2_t test__Uint32x2_t(__Uint32x2_t x) { return x; } +// CHECK-C-LABEL: define dso_local <4 x half> @test__Float16x4_t( +// CHECK-C-SAME: <4 x half> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x half>, align 8 +// CHECK-C-NEXT: store <4 x half> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <4 x half>, 
ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <4 x half> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <4 x half> @_Z17test__Float16x4_t13__Float16x4_t( +// CHECK-CPP-SAME: <4 x half> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x half>, align 8 +// CHECK-CPP-NEXT: store <4 x half> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x half>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <4 x half> [[TMP0]] +// +float16x4_t test__Float16x4_t(__Float16x4_t x) { return x; } +// CHECK-C-LABEL: define dso_local <2 x float> @test__Float32x2_t( +// CHECK-C-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <2 x float>, align 8 +// CHECK-C-NEXT: store <2 x float> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <2 x float> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <2 x float> @_Z17test__Float32x2_t13__Float32x2_t( +// CHECK-CPP-SAME: <2 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <2 x float>, align 8 +// CHECK-CPP-NEXT: store <2 x float> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <2 x float> [[TMP0]] +// +float32x2_t test__Float32x2_t(__Float32x2_t x) { return x; } +// CHECK-C-LABEL: define dso_local <8 x i8> @test__Poly8x8_t( +// CHECK-C-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-C-NEXT: store <8 x i8> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <8 x i8> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <8 x i8> @_Z15test__Poly8x8_t11__Poly8x8_t( +// 
CHECK-CPP-SAME: <8 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-CPP-NEXT: store <8 x i8> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <8 x i8> [[TMP0]] +// +poly8x8_t test__Poly8x8_t(__Poly8x8_t x) { return x; } +// CHECK-C-LABEL: define dso_local <4 x i16> @test__Poly16x4_t( +// CHECK-C-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-C-NEXT: store <4 x i16> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <4 x i16> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <4 x i16> @_Z16test__Poly16x4_t12__Poly16x4_t( +// CHECK-CPP-SAME: <4 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x i16>, align 8 +// CHECK-CPP-NEXT: store <4 x i16> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x i16>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <4 x i16> [[TMP0]] +// +poly16x4_t test__Poly16x4_t(__Poly16x4_t x) { return x; } +// CHECK-C-LABEL: define dso_local <4 x bfloat> @test__Bfloat16x4_t( +// CHECK-C-SAME: <4 x bfloat> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x bfloat>, align 8 +// CHECK-C-NEXT: store <4 x bfloat> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <4 x bfloat>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <4 x bfloat> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <4 x bfloat> @_Z18test__Bfloat16x4_t14__Bfloat16x4_t( +// CHECK-CPP-SAME: <4 x bfloat> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x bfloat>, align 8 +// 
CHECK-CPP-NEXT: store <4 x bfloat> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x bfloat>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <4 x bfloat> [[TMP0]] +// +bfloat16x4_t test__Bfloat16x4_t(__Bfloat16x4_t x) { return x; } +// CHECK-C-LABEL: define dso_local <16 x i8> @test__Int8x16_t( +// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-C-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z15test__Int8x16_t11__Int8x16_t( +// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-CPP-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <16 x i8> [[TMP0]] +// +int8x16_t test__Int8x16_t(__Int8x16_t x) { return x; } +// CHECK-C-LABEL: define dso_local <8 x i16> @test__Int16x8_t( +// CHECK-C-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-C-NEXT: store <8 x i16> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <8 x i16> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <8 x i16> @_Z15test__Int16x8_t11__Int16x8_t( +// CHECK-CPP-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-CPP-NEXT: store <8 x i16> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <8 x i16> 
[[TMP0]] +// +int16x8_t test__Int16x8_t(__Int16x8_t x) { return x; } +// CHECK-C-LABEL: define dso_local <4 x i32> @test__Int32x4_t( +// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x i32>, align 16 +// CHECK-C-NEXT: store <4 x i32> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <4 x i32> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <4 x i32> @_Z15test__Int32x4_t11__Int32x4_t( +// CHECK-CPP-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x i32>, align 16 +// CHECK-CPP-NEXT: store <4 x i32> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test__Int32x4_t(__Int32x4_t x) { return x; } +// CHECK-C-LABEL: define dso_local <2 x i64> @test__Int64x2_t( +// CHECK-C-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-C-NEXT: store <2 x i64> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <2 x i64> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <2 x i64> @_Z15test__Int64x2_t11__Int64x2_t( +// CHECK-CPP-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-CPP-NEXT: store <2 x i64> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <2 x i64> [[TMP0]] +// +int64x2_t test__Int64x2_t(__Int64x2_t x) { return x; } +// CHECK-C-LABEL: define dso_local <16 x i8> @test__Uint8x16_t( +// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// 
CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-C-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z16test__Uint8x16_t12__Uint8x16_t( +// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-CPP-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test__Uint8x16_t(__Uint8x16_t x) { return x; } +// CHECK-C-LABEL: define dso_local <8 x i16> @test__Uint16x8_t( +// CHECK-C-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-C-NEXT: store <8 x i16> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <8 x i16> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <8 x i16> @_Z16test__Uint16x8_t12__Uint16x8_t( +// CHECK-CPP-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-CPP-NEXT: store <8 x i16> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <8 x i16> [[TMP0]] +// +uint16x8_t test__Uint16x8_t(__Uint16x8_t x) { return x; } +// CHECK-C-LABEL: define dso_local <4 x i32> @test__Uint32x4_t( +// CHECK-C-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x i32>, align 16 +// CHECK-C-NEXT: store <4 x i32> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: 
[[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <4 x i32> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <4 x i32> @_Z16test__Uint32x4_t12__Uint32x4_t( +// CHECK-CPP-SAME: <4 x i32> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x i32>, align 16 +// CHECK-CPP-NEXT: store <4 x i32> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x i32>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test__Uint32x4_t(__Uint32x4_t x) { return x; } +// CHECK-C-LABEL: define dso_local <2 x i64> @test__Uint64x2_t( +// CHECK-C-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-C-NEXT: store <2 x i64> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <2 x i64> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <2 x i64> @_Z16test__Uint64x2_t12__Uint64x2_t( +// CHECK-CPP-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-CPP-NEXT: store <2 x i64> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <2 x i64> [[TMP0]] +// +uint64x2_t test__Uint64x2_t(__Uint64x2_t x) { return x; } +// CHECK-C-LABEL: define dso_local <8 x half> @test__Float16x8_t( +// CHECK-C-SAME: <8 x half> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <8 x half>, align 16 +// CHECK-C-NEXT: store <8 x half> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <8 x half> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <8 x half> 
@_Z17test__Float16x8_t13__Float16x8_t( +// CHECK-CPP-SAME: <8 x half> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <8 x half>, align 16 +// CHECK-CPP-NEXT: store <8 x half> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <8 x half>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <8 x half> [[TMP0]] +// +float16x8_t test__Float16x8_t(__Float16x8_t x) { return x; } +// CHECK-C-LABEL: define dso_local <4 x float> @test__Float32x4_t( +// CHECK-C-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-C-NEXT: store <4 x float> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <4 x float> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <4 x float> @_Z17test__Float32x4_t13__Float32x4_t( +// CHECK-CPP-SAME: <4 x float> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <4 x float>, align 16 +// CHECK-CPP-NEXT: store <4 x float> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test__Float32x4_t(__Float32x4_t x) { return x; } +// CHECK-C-LABEL: define dso_local <2 x double> @test__Float64x2_t( +// CHECK-C-SAME: <2 x double> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-C-NEXT: store <2 x double> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <2 x double> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <2 x double> @_Z17test__Float64x2_t13__Float64x2_t( +// CHECK-CPP-SAME: <2 x double> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: 
[[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <2 x double>, align 16 +// CHECK-CPP-NEXT: store <2 x double> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <2 x double> [[TMP0]] +// +float64x2_t test__Float64x2_t(__Float64x2_t x) { return x; } +// CHECK-C-LABEL: define dso_local <16 x i8> @test__Poly8x16_t( +// CHECK-C-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-C-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <16 x i8> @_Z16test__Poly8x16_t12__Poly8x16_t( +// CHECK-CPP-SAME: <16 x i8> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-CPP-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <16 x i8> [[TMP0]] +// +poly8x16_t test__Poly8x16_t(__Poly8x16_t x) { return x; } +// CHECK-C-LABEL: define dso_local <8 x i16> @test__Poly16x8_t( +// CHECK-C-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-C-NEXT: store <8 x i16> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <8 x i16> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <8 x i16> @_Z16test__Poly16x8_t12__Poly16x8_t( +// CHECK-CPP-SAME: <8 x i16> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <8 x i16>, align 16 +// CHECK-CPP-NEXT: store <8 x i16> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: 
[[TMP0:%.*]] = load <8 x i16>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <8 x i16> [[TMP0]] +// +poly16x8_t test__Poly16x8_t(__Poly16x8_t x) { return x; } +// CHECK-C-LABEL: define dso_local <2 x i64> @test__Poly64x2_t( +// CHECK-C-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-C-NEXT: store <2 x i64> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <2 x i64> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <2 x i64> @_Z16test__Poly64x2_t12__Poly64x2_t( +// CHECK-CPP-SAME: <2 x i64> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <2 x i64>, align 16 +// CHECK-CPP-NEXT: store <2 x i64> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <2 x i64> [[TMP0]] +// +poly64x2_t test__Poly64x2_t(__Poly64x2_t x) { return x; } +// CHECK-C-LABEL: define dso_local <8 x bfloat> @test__Bfloat16x8_t( +// CHECK-C-SAME: <8 x bfloat> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <8 x bfloat>, align 16 +// CHECK-C-NEXT: store <8 x bfloat> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <8 x bfloat> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local noundef <8 x bfloat> @_Z18test__Bfloat16x8_t14__Bfloat16x8_t( +// CHECK-CPP-SAME: <8 x bfloat> noundef [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <8 x bfloat>, align 16 +// CHECK-CPP-NEXT: store <8 x bfloat> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <8 x bfloat> [[TMP0]] +// +bfloat16x8_t 
test__Bfloat16x8_t(__Bfloat16x8_t x) { return x; } +// CHECK-C-LABEL: define dso_local <8 x i8> @test__Mfloat8x8_t( +// CHECK-C-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-C-NEXT: store <8 x i8> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8 +// CHECK-C-NEXT: ret <8 x i8> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local <8 x i8> @_Z17test__Mfloat8x8_t13__Mfloat8x8_t( +// CHECK-CPP-SAME: <8 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <8 x i8>, align 8 +// CHECK-CPP-NEXT: store <8 x i8> [[X]], ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[X_ADDR]], align 8 +// CHECK-CPP-NEXT: ret <8 x i8> [[TMP0]] +// +mfloat8x8_t test__Mfloat8x8_t(__Mfloat8x8_t x) { return x; } +// CHECK-C-LABEL: define dso_local <16 x i8> @test__Mfloat8x16_t( +// CHECK-C-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-C-NEXT: [[ENTRY:.*:]] +// CHECK-C-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-C-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-C-NEXT: ret <16 x i8> [[TMP0]] +// +// CHECK-CPP-LABEL: define dso_local <16 x i8> @_Z18test__Mfloat8x16_t14__Mfloat8x16_t( +// CHECK-CPP-SAME: <16 x i8> [[X:%.*]]) #[[ATTR0]] { +// CHECK-CPP-NEXT: [[ENTRY:.*:]] +// CHECK-CPP-NEXT: [[X_ADDR:%.*]] = alloca <16 x i8>, align 16 +// CHECK-CPP-NEXT: store <16 x i8> [[X]], ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: [[TMP0:%.*]] = load <16 x i8>, ptr [[X_ADDR]], align 16 +// CHECK-CPP-NEXT: ret <16 x i8> [[TMP0]] +// +mfloat8x16_t test__Mfloat8x16_t(__Mfloat8x16_t x) { return x; } From 6e0c2bc668107547365d79a6e5f57317a6302c29 Mon Sep 17 00:00:00 2001 From: Javed Absar Date: Wed, 11 Jun 2025 10:05:34 +0100 Subject: [PATCH 048/851] [mlir][async][nfc] Fix typo in 
async op description (#143621) --- mlir/include/mlir/Dialect/Async/IR/AsyncOps.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td b/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td index 3d29d5bc7dbb6..6dbcdefbc9332 100644 --- a/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td +++ b/mlir/include/mlir/Dialect/Async/IR/AsyncOps.td @@ -376,7 +376,7 @@ def Async_CreateGroupOp : Async_Op<"create_group", [Pure]> { } def Async_AddToGroupOp : Async_Op<"add_to_group", []> { - let summary = "adds and async token or value to the group"; + let summary = "adds an async token or value to the group"; let description = [{ The `async.add_to_group` adds an async token or value to the async group. Returns the rank of the added element in the group. This rank is fixed @@ -655,7 +655,7 @@ def Async_RuntimeLoadOp : Async_Op<"runtime.load", } def Async_RuntimeAddToGroupOp : Async_Op<"runtime.add_to_group", []> { - let summary = "adds and async token or value to the group"; + let summary = "adds an async token or value to the group"; let description = [{ The `async.runtime.add_to_group` adds an async token or value to the async group. Returns the rank of the added element in the group. From 7ffdf4240d62724dca7f42b37bd8671fefe17e17 Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Wed, 11 Jun 2025 10:21:07 +0100 Subject: [PATCH 049/851] [flang][Driver] Enable support for -mmacos-version-min= (#143508) So far as I can tell this option is driver-only so we can just re-use what already exists for clang. I've added a unit test based on clang's unit test to demonstrate that the option is handled. Still TODO is to ensure that flang-rt is built with the same macos minimum version as compiler-rt. 
At the moment, setting the flang minimum version to older than the macos version on which flang was built will lead to link warnings because flang-rt is built for the version of macos on which flang was built rather than the oldest supported version (as compiler-rt is). --- clang/include/clang/Driver/Options.td | 2 + flang/test/Driver/darwin-version.f90 | 107 ++++++++++++++++++++++++++ 2 files changed, 109 insertions(+) create mode 100644 flang/test/Driver/darwin-version.f90 diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 3582efd7721b0..152df89118a6a 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4927,8 +4927,10 @@ def ffuchsia_api_level_EQ : Joined<["-"], "ffuchsia-api-level=">, HelpText<"Set Fuchsia API level">, MarshallingInfoInt<LangOpts<"FuchsiaAPILevel">>; def mmacos_version_min_EQ : Joined<["-"], "mmacos-version-min=">, + Visibility<[ClangOption, CC1Option, FlangOption]>, Group<m_Group>, HelpText<"Set macOS deployment target">; def : Joined<["-"], "mmacosx-version-min=">, + Visibility<[ClangOption, CC1Option, FC1Option, FlangOption]>, Group<m_Group>, Alias<mmacos_version_min_EQ>; def mms_bitfields : Flag<["-"], "mms-bitfields">, Group<m_Group>, Visibility<[ClangOption, CC1Option]>, diff --git a/flang/test/Driver/darwin-version.f90 b/flang/test/Driver/darwin-version.f90 new file mode 100644 index 0000000000000..99d19ee44be9b --- /dev/null +++ b/flang/test/Driver/darwin-version.f90 @@ -0,0 +1,107 @@ +! Based on clang's darwin-version.c test with tests for ios watchos and tvos +! removed + +! RUN: %flang -target i686-apple-darwin8 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX4 %s +! RUN: %flang -target i686-apple-darwin9 -mmacos-version-min=10.4 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX4 %s +! CHECK-VERSION-OSX4: "i686-apple-macosx10.4.0" +! RUN: %flang -target i686-apple-darwin9 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX5 %s +!
RUN: %flang -target i686-apple-darwin9 -mmacos-version-min=10.5 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX5 %s +! CHECK-VERSION-OSX5: "i686-apple-macosx10.5.0" +! RUN: %flang -target i686-apple-darwin10 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX6 %s +! RUN: %flang -target i686-apple-darwin9 -mmacos-version-min=10.6 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX6 %s +! CHECK-VERSION-OSX6: "i686-apple-macosx10.6.0" +! RUN: %flang -target x86_64-apple-darwin14 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX10 %s +! RUN: %flang -target x86_64-apple-darwin -mmacos-version-min=10.10 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX10 %s +! RUN: %flang -target x86_64-apple-darwin -mmacos-version-min=10.10 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX10 %s +! CHECK-VERSION-OSX10: "x86_64-apple-macosx10.10.0" +! RUN: not %flang -target x86_64-apple-darwin -mmacos-version-min= -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-MISSING %s +! RUN: not %flang -target x86_64-apple-darwin -mmacos-version-min= -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-MISSING %s +! CHECK-VERSION-MISSING: missing version number + +! RUN: %flang -target x86_64-apple-driverkit19.0 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-DRIVERKIT190 %s +! CHECK-VERSION-DRIVERKIT190: "x86_64-apple-driverkit19.0.0" + +! Check environment variable gets interpreted correctly +! RUN: env MACOSX_DEPLOYMENT_TARGET=10.5 IPHONEOS_DEPLOYMENT_TARGET=2.0 \ +! RUN: %flang -target i686-apple-darwin9 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX5 %s + +! RUN: env MACOSX_DEPLOYMENT_TARGET=10.4.10 \ +! RUN: %flang -target i386-apple-darwin9 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-OSX49 %s +! CHECK-VERSION-OSX49: "i386-apple-macosx10.4.10" +! 
RUN: env IPHONEOS_DEPLOYMENT_TARGET=2.3.1 \ + +! Target can specify the OS version: + +! RUN: %flang -target x86_64-apple-macos10.11.2 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-TMAC2 %s +! CHECK-VERSION-TMAC2: "x86_64-apple-macosx10.11.2" + +! Warn about -m-version-min when it's used with target: + +! RUN: %flang -target x86_64-apple-macos10.11.2 -mmacos-version-min=10.6 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-TNO-OSV1 %s +! CHECK-VERSION-TNO-OSV1: overriding '-mmacos-version-min=10.6' option with '-target x86_64-apple-macos10.11.2' + +! RUN: %flang -target x86_64-apple-macos10.6 -mmacos-version-min=10.6 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-TNO-SAME %s +! CHECK-VERSION-TNO-SAME-NOT: overriding +! CHECK-VERSION-TNO-SAME-NOT: argument unused during compilation + +! Target with OS version is not overridden by -m-version-min variables: + +! RUN: %flang -target x86_64-apple-macos10.11.2 -mmacos-version-min=10.6 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-TIGNORE-OSV1 %s +! CHECK-VERSION-TIGNORE-OSV1: "x86_64-apple-macosx10.11.2" + +! Target without OS version includes the OS given by -m-version-min arguments: + +! RUN: %flang -target x86_64-apple-macos -mmacos-version-min=10.11 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-USE-OS-ARG1 %s +! CHECK-VERSION-USE-OS-ARG1: "x86_64-apple-macosx10.11.0" + +! Target with OS version is not overridden by environment variables: + +! RUN: env MACOSX_DEPLOYMENT_TARGET=10.1 \ +! RUN: %flang -target i386-apple-macos10.5 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-TMACOS-CMD %s +! CHECK-VERSION-TMACOS-CMD: "i386-apple-macosx10.5.0" + +! Target with OS version is not overridden by arch: + +! RUN: %flang -target uknown-apple-macos10.11.2 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-VERSION-TIGNORE-ARCH1 %s +! CHECK-VERSION-TIGNORE-ARCH1: "unknown-apple-macosx10.11.2" + +! 
Target can be used to specify the environment: + +! RUN: %flang -target x86_64-apple-macos11 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-MACOS11 %s +! RUN: %flang -target x86_64-apple-darwin20 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-MACOS11 %s +! RUN: %flang -target x86_64-apple-darwin -mmacos-version-min=11 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-MACOS11 %s +! CHECK-MACOS11: "x86_64-apple-macosx11.0.0" + +! RUN: %flang -target arm64-apple-macosx10.16 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-IMPLICIT-MACOS11 %s +! CHECK-IMPLICIT-MACOS11: warning: overriding deployment version +! CHECK-IMPLICIT-MACOS11: "arm64-apple-macosx11.0.0" + +! RUN: %flang -target arm64-apple-macos999 -c %s -### 2>&1 | \ +! RUN: FileCheck --check-prefix=CHECK-MACOS999 %s + +! CHECK-MACOS999: "arm64-apple-macosx999.0.0" From 9797b5fcfbb9b9c96a219985f3623849bbd3956e Mon Sep 17 00:00:00 2001 From: Dmitry Polukhin <34227995+dmpolukhin@users.noreply.github.com> Date: Wed, 11 Jun 2025 10:35:06 +0100 Subject: [PATCH 050/851] [C++20][Modules] Fix false compilation error with constexpr (#143168) Use declaresSameEntity when evaluating constexpr to avoid resetting computed union value due to using different instances of the merged field decl. --- clang/docs/ReleaseNotes.rst | 1 + clang/lib/AST/ExprConstant.cpp | 3 +- .../constexpr-initialization-failure.cpp | 44 +++++++++++++++++++ 3 files changed, 47 insertions(+), 1 deletion(-) create mode 100644 clang/test/Modules/constexpr-initialization-failure.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5645edc73431b..b5e6cf088a4b1 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -693,6 +693,7 @@ Bug Fixes in This Version - Fixed type mismatch error when 'builtin-elementwise-math' arguments have different qualifiers, this should be well-formed. 
(#GH141397) - Constant evaluation now correctly runs the destructor of a variable declared in the second clause of a C-style ``for`` loop. (#GH139818) +- Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. (#GH143168) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index fa4e10e84de05..27ea55e981446 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -6844,7 +6844,8 @@ static bool HandleConstructorCall(const Expr *E, const LValue &This, // FIXME: In this case, the values of the other subobjects are // specified, since zero-initialization sets all padding bits to zero. if (!Value->hasValue() || - (Value->isUnion() && Value->getUnionField() != FD)) { + (Value->isUnion() && + !declaresSameEntity(Value->getUnionField(), FD))) { if (CD->isUnion()) *Value = APValue(FD); else diff --git a/clang/test/Modules/constexpr-initialization-failure.cpp b/clang/test/Modules/constexpr-initialization-failure.cpp new file mode 100644 index 0000000000000..8ff20f2fc8ac6 --- /dev/null +++ b/clang/test/Modules/constexpr-initialization-failure.cpp @@ -0,0 +1,44 @@ +// RUN: rm -fR %t +// RUN: split-file %s %t +// RUN: cd %t +// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-name=h1.h -emit-header-unit -xc++-user-header h1.h -o h1.pcm +// RUN: %clang_cc1 -verify -w -std=c++20 -fmodule-map-file=module.modulemap -fmodule-file=h1.h=h1.pcm main.cpp -o main.o + +//--- module.modulemap +module "h1.h" { + header "h1.h" + export * +} + +//--- h0.h +// expected-no-diagnostics +#pragma once + +template <typename T> struct A { + union { + struct { + T x, y, z; + }; + }; + constexpr A(T, T, T) : x(), y(), z() {} +}; +typedef A<float> packed_vec3; + +//--- h1.h +// expected-no-diagnostics +#pragma once + +#include "h0.h" + +constexpr packed_vec3 kMessThingsUp = packed_vec3(5.0f, 5.0f, 5.0f); + +//--- main.cpp +// expected-no-diagnostics +#include "h0.h" +
+static_assert(sizeof(packed_vec3) == sizeof(float) * 3); +static_assert(alignof(packed_vec3) == sizeof(float)); + +import "h1.h"; + +constexpr packed_vec3 kDefaultHalfExtents = packed_vec3(5.0f, 5.0f, 5.0f); From c59cc2b690b9e528a82ba214f74a8f7c8abb3cde Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 11:43:34 +0200 Subject: [PATCH 051/851] [libunwind] Remove checks for -nostdlib++ (#143162) libunwind uses a C linker, so it's never even trying to link against any C++ libraries. This removes the code which tries to drop C++ libraries, which makes the CMake configuration simpler and allows for upgrading GCC. --- libunwind/cmake/config-ix.cmake | 56 --------------------------------- libunwind/src/CMakeLists.txt | 12 ------- 2 files changed, 68 deletions(-) diff --git a/libunwind/cmake/config-ix.cmake b/libunwind/cmake/config-ix.cmake index 126c872f0d489..d42ceffb1f631 100644 --- a/libunwind/cmake/config-ix.cmake +++ b/libunwind/cmake/config-ix.cmake @@ -26,62 +26,6 @@ if (NOT LIBUNWIND_USE_COMPILER_RT) endif () endif() -# libunwind is using -nostdlib++ at the link step when available, -# otherwise -nodefaultlibs is used. We want all our checks to also -# use one of these options, otherwise we may end up with an inconsistency between -# the flags we think we require during configuration (if the checks are -# performed without one of those options) and the flags that are actually -# required during compilation (which has the -nostdlib++ or -nodefaultlibs). libc is -# required for the link to go through. We remove sanitizers from the -# configuration checks to avoid spurious link errors. 
- -llvm_check_compiler_linker_flag(CXX "-nostdlib++" CXX_SUPPORTS_NOSTDLIBXX_FLAG) -if (CXX_SUPPORTS_NOSTDLIBXX_FLAG) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nostdlib++") -else() - llvm_check_compiler_linker_flag(C "-nodefaultlibs" C_SUPPORTS_NODEFAULTLIBS_FLAG) - if (C_SUPPORTS_NODEFAULTLIBS_FLAG) - set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -nodefaultlibs") - endif() -endif() - -# Only link against compiler-rt manually if we use -nodefaultlibs, since -# otherwise the compiler will do the right thing on its own. -if (NOT CXX_SUPPORTS_NOSTDLIBXX_FLAG AND C_SUPPORTS_NODEFAULTLIBS_FLAG) - if (LIBUNWIND_HAS_C_LIB) - list(APPEND CMAKE_REQUIRED_LIBRARIES c) - endif () - if (LIBUNWIND_HAS_ROOT_LIB) - list(APPEND CMAKE_REQUIRED_LIBRARIES root) - endif () - if (LIBUNWIND_USE_COMPILER_RT) - include(HandleCompilerRT) - find_compiler_rt_library(builtins LIBUNWIND_BUILTINS_LIBRARY - FLAGS ${LIBUNWIND_COMPILE_FLAGS}) - list(APPEND CMAKE_REQUIRED_LIBRARIES "${LIBUNWIND_BUILTINS_LIBRARY}") - else () - if (LIBUNWIND_HAS_GCC_S_LIB) - list(APPEND CMAKE_REQUIRED_LIBRARIES gcc_s) - endif () - if (LIBUNWIND_HAS_GCC_LIB) - list(APPEND CMAKE_REQUIRED_LIBRARIES gcc) - endif () - endif () - if (MINGW) - # Mingw64 requires quite a few "C" runtime libraries in order for basic - # programs to link successfully with -nodefaultlibs. 
- if (LIBUNWIND_USE_COMPILER_RT) - set(MINGW_RUNTIME ${LIBUNWIND_BUILTINS_LIBRARY}) - else () - set(MINGW_RUNTIME gcc_s gcc) - endif() - set(MINGW_LIBRARIES mingw32 ${MINGW_RUNTIME} moldname mingwex msvcrt advapi32 - shell32 user32 kernel32 mingw32 ${MINGW_RUNTIME} - moldname mingwex msvcrt) - list(APPEND CMAKE_REQUIRED_LIBRARIES ${MINGW_LIBRARIES}) - endif() -endif() - if (CXX_SUPPORTS_NOSTDLIBXX_FLAG OR C_SUPPORTS_NODEFAULTLIBS_FLAG) if (CMAKE_C_FLAGS MATCHES -fsanitize OR CMAKE_CXX_FLAGS MATCHES -fsanitize) set(CMAKE_REQUIRED_FLAGS "${CMAKE_REQUIRED_FLAGS} -fno-sanitize=all") diff --git a/libunwind/src/CMakeLists.txt b/libunwind/src/CMakeLists.txt index 70bd3a017cda7..03818b1bb2512 100644 --- a/libunwind/src/CMakeLists.txt +++ b/libunwind/src/CMakeLists.txt @@ -71,18 +71,6 @@ set(LIBUNWIND_SOURCES ${LIBUNWIND_ASM_SOURCES}) # Generate library list. -if (CXX_SUPPORTS_NOSTDLIBXX_FLAG) - add_link_flags_if_supported(-nostdlib++) -else() - if (LIBUNWIND_USE_COMPILER_RT) - add_library_flags("${LIBUNWIND_BUILTINS_LIBRARY}") - else() - add_library_flags_if(LIBUNWIND_HAS_GCC_S_LIB gcc_s) - add_library_flags_if(LIBUNWIND_HAS_GCC_LIB gcc) - endif() - add_library_flags_if(LIBUNWIND_HAS_C_LIB c) -endif() - if (NOT APPLE) add_library_flags_if(LIBUNWIND_HAS_DL_LIB dl) endif() From ea9046699eae04ac5159a1666f19b5b32e5d41c1 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Wed, 11 Jun 2025 11:02:32 +0100 Subject: [PATCH 052/851] [LLVM][SROA] Teach SROA how to "bitcast" between fixed and scalable vectors. (#130973) For function whose vscale_range is limited to a single value we can size scalable vectors. This aids SROA by allowing scalable vector load and store operations to be considered for replacement whereby bitcasts through memory can be replaced by vector insert or extract operations. 
--- .../CodeGen/attr-arm-sve-vector-bits-cast.c | 23 +- llvm/include/llvm/IR/Function.h | 4 + llvm/lib/IR/Function.cpp | 12 + llvm/lib/Transforms/Scalar/SROA.cpp | 165 +++++++-- .../scalable-vectors-with-known-vscale.ll | 349 ++++++++++++++++++ llvm/test/Transforms/SROA/scalable-vectors.ll | 223 ++++++++++- 6 files changed, 721 insertions(+), 55 deletions(-) create mode 100644 llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c index e1e2220f94d6d..fcd4314249ff8 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-cast.c @@ -62,10 +62,7 @@ fixed_bool_t from_svbool_t(svbool_t type) { // CHECK-LABEL: @lax_cast( // CHECK-NEXT: entry: -// CHECK-NEXT: [[SAVED_VALUE:%.*]] = alloca <16 x i32>, align 64 -// CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6:![0-9]+]] -// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[SAVED_VALUE]], align 64, !tbaa [[TBAA6]] +// CHECK-NEXT: [[TMP0:%.*]] = bitcast [[TYPE_COERCE:%.*]] to // CHECK-NEXT: ret [[TMP0]] // svint64_t lax_cast(fixed_int32_t type) { @@ -74,9 +71,9 @@ svint64_t lax_cast(fixed_int32_t type) { // CHECK-LABEL: @to_svint32_t__from_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]] -// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) -// CHECK-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2:![0-9]+]] +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE]] // svint32_t 
to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { return type; @@ -84,8 +81,8 @@ svint32_t to_svint32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-LABEL: @from_svint32_t__to_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CASTFIXEDSVE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[CASTFIXEDSVE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE:%.*]], i64 0) +// CHECK-NEXT: store <16 x i32> [[CAST_FIXED]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { @@ -94,9 +91,9 @@ gnu_int32_t from_svint32_t__to_gnu_int32_t(svint32_t type) { // CHECK-LABEL: @to_fixed_int32_t__from_gnu_int32_t( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA6]] -// CHECK-NEXT: [[CASTSCALABLESVE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) -// CHECK-NEXT: ret [[CASTSCALABLESVE]] +// CHECK-NEXT: [[TYPE:%.*]] = load <16 x i32>, ptr [[TMP0:%.*]], align 16, !tbaa [[TBAA2]] +// CHECK-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv4i32.v16i32( poison, <16 x i32> [[TYPE]], i64 0) +// CHECK-NEXT: ret [[CAST_SCALABLE]] // fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { return type; @@ -105,7 +102,7 @@ fixed_int32_t to_fixed_int32_t__from_gnu_int32_t(gnu_int32_t type) { // CHECK-LABEL: @from_fixed_int32_t__to_gnu_int32_t( // CHECK-NEXT: entry: // CHECK-NEXT: [[TYPE:%.*]] = tail call <16 x i32> @llvm.vector.extract.v16i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) -// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA6]] +// CHECK-NEXT: store <16 x i32> [[TYPE]], ptr [[AGG_RESULT:%.*]], align 16, !tbaa [[TBAA2]] // CHECK-NEXT: ret void // gnu_int32_t 
from_fixed_int32_t__to_gnu_int32_t(fixed_int32_t type) { diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index c2510ea75544a..f24d03635731e 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -1053,6 +1053,10 @@ class LLVM_ABI Function : public GlobalObject, public ilist_node { /// defined. void setAlignment(MaybeAlign Align) { GlobalObject::setAlignment(Align); } + /// Return the value for vscale based on the vscale_range attribute or 0 when + /// unknown. + unsigned getVScaleValue() const; + private: void allocHungoffUselist(); template void setHungoffOperand(Constant *C); diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 63665d837c398..493dec72d45af 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -1165,6 +1165,18 @@ bool Function::nullPointerIsDefined() const { return hasFnAttribute(Attribute::NullPointerIsValid); } +unsigned Function::getVScaleValue() const { + Attribute Attr = getFnAttribute(Attribute::VScaleRange); + if (!Attr.isValid()) + return 0; + + unsigned VScale = Attr.getVScaleRangeMin(); + if (VScale && VScale == Attr.getVScaleRangeMax()) + return VScale; + + return 0; +} + bool llvm::NullPointerIsDefined(const Function *F, unsigned AS) { if (F && F->nullPointerIsDefined()) return true; diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index a4e373d395b90..42d1d9a437bb2 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -1120,8 +1120,13 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { return PI.setEscapedReadOnly(&LI); TypeSize Size = DL.getTypeStoreSize(LI.getType()); - if (Size.isScalable()) - return PI.setAborted(&LI); + if (Size.isScalable()) { + unsigned VScale = LI.getFunction()->getVScaleValue(); + if (!VScale) + return PI.setAborted(&LI); + + Size = TypeSize::getFixed(Size.getKnownMinValue() * VScale); + } return handleLoadOrStore(LI.getType(), 
LI, Offset, Size.getFixedValue(), LI.isVolatile()); @@ -1135,8 +1140,13 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor { return PI.setAborted(&SI); TypeSize StoreSize = DL.getTypeStoreSize(ValOp->getType()); - if (StoreSize.isScalable()) - return PI.setAborted(&SI); + if (StoreSize.isScalable()) { + unsigned VScale = SI.getFunction()->getVScaleValue(); + if (!VScale) + return PI.setAborted(&SI); + + StoreSize = TypeSize::getFixed(StoreSize.getKnownMinValue() * VScale); + } uint64_t Size = StoreSize.getFixedValue(); @@ -1927,7 +1937,8 @@ static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) { /// ensure that we only try to convert viable values. The strategy is that we /// will peel off single element struct and array wrappings to get to an /// underlying value, and convert that value. -static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { +static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy, + unsigned VScale = 0) { if (OldTy == NewTy) return true; @@ -1941,8 +1952,35 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { return false; } - if (DL.getTypeSizeInBits(NewTy).getFixedValue() != - DL.getTypeSizeInBits(OldTy).getFixedValue()) + TypeSize NewSize = DL.getTypeSizeInBits(NewTy); + TypeSize OldSize = DL.getTypeSizeInBits(OldTy); + + if ((isa(NewTy) && isa(OldTy)) || + (isa(OldTy) && isa(NewTy))) { + // Conversion is only possible when the size of scalable vectors is known. + if (!VScale) + return false; + + // For ptr-to-int and int-to-ptr casts, the pointer side is resolved within + // a single domain (either fixed or scalable). Any additional conversion + // between fixed and scalable types is handled through integer types. + auto OldVTy = OldTy->isPtrOrPtrVectorTy() ? DL.getIntPtrType(OldTy) : OldTy; + auto NewVTy = NewTy->isPtrOrPtrVectorTy() ? 
DL.getIntPtrType(NewTy) : NewTy; + + if (isa(NewTy)) { + if (!VectorType::getWithSizeAndScalar(cast(NewVTy), OldVTy)) + return false; + + NewSize = TypeSize::getFixed(NewSize.getKnownMinValue() * VScale); + } else { + if (!VectorType::getWithSizeAndScalar(cast(OldVTy), NewVTy)) + return false; + + OldSize = TypeSize::getFixed(OldSize.getKnownMinValue() * VScale); + } + } + + if (NewSize != OldSize) return false; if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType()) return false; @@ -1992,7 +2030,14 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) { static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, Type *NewTy) { Type *OldTy = V->getType(); - assert(canConvertValue(DL, OldTy, NewTy) && "Value not convertable to type"); + +#ifndef NDEBUG + BasicBlock *BB = IRB.GetInsertBlock(); + assert(BB && BB->getParent() && "VScale unknown!"); + unsigned VScale = BB->getParent()->getVScaleValue(); + assert(canConvertValue(DL, OldTy, NewTy, VScale) && + "Value not convertable to type"); +#endif if (OldTy == NewTy) return V; @@ -2000,13 +2045,41 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, assert(!(isa(OldTy) && isa(NewTy)) && "Integer types must be the exact same to convert."); + // A variant of bitcast that supports a mixture of fixed and scalable types + // that are know to have the same size. 
+ auto CreateBitCastLike = [&IRB](Value *In, Type *Ty) -> Value * { + Type *InTy = In->getType(); + if (InTy == Ty) + return In; + + if (isa(InTy) && isa(Ty)) { + // For vscale_range(2) expand <4 x i32> to --> + // <4 x i32> to to + auto *VTy = VectorType::getWithSizeAndScalar(cast(Ty), InTy); + return IRB.CreateBitCast(IRB.CreateInsertVector(VTy, + PoisonValue::get(VTy), In, + IRB.getInt64(0)), + Ty); + } + + if (isa(InTy) && isa(Ty)) { + // For vscale_range(2) expand to <4 x i32> --> + // to to <4 x i32> + auto *VTy = VectorType::getWithSizeAndScalar(cast(InTy), Ty); + return IRB.CreateExtractVector(Ty, IRB.CreateBitCast(In, VTy), + IRB.getInt64(0)); + } + + return IRB.CreateBitCast(In, Ty); + }; + // See if we need inttoptr for this type pair. May require additional bitcast. if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) { // Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8* // Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*> // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*> // Directly handle i64 to i8* - return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)), + return IRB.CreateIntToPtr(CreateBitCastLike(V, DL.getIntPtrType(NewTy)), NewTy); } @@ -2016,7 +2089,7 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, // Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32> // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32> // Expand i8* to i64 --> i8* to i64 to i64 - return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + return CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), NewTy); } @@ -2031,12 +2104,14 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, // size. 
if (OldAS != NewAS) { assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS)); - return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), - NewTy); + return IRB.CreateIntToPtr( + CreateBitCastLike(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)), + DL.getIntPtrType(NewTy)), + NewTy); } } - return IRB.CreateBitCast(V, NewTy); + return CreateBitCastLike(V, NewTy); } /// Test whether the given slice use can be promoted to a vector. @@ -2046,7 +2121,8 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V, static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, VectorType *Ty, uint64_t ElementSize, - const DataLayout &DL) { + const DataLayout &DL, + unsigned VScale) { // First validate the slice offsets. uint64_t BeginOffset = std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset(); @@ -2090,7 +2166,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, assert(LTy->isIntegerTy()); LTy = SplitIntTy; } - if (!canConvertValue(DL, SliceTy, LTy)) + if (!canConvertValue(DL, SliceTy, LTy, VScale)) return false; } else if (StoreInst *SI = dyn_cast(U->getUser())) { if (SI->isVolatile()) @@ -2103,7 +2179,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, assert(STy->isIntegerTy()); STy = SplitIntTy; } - if (!canConvertValue(DL, STy, SliceTy)) + if (!canConvertValue(DL, STy, SliceTy, VScale)) return false; } else { return false; @@ -2118,7 +2194,7 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, /// (and thus isVectorPromotionViable) over all slices of the alloca for the /// given VectorType. 
static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, - const DataLayout &DL) { + const DataLayout &DL, unsigned VScale) { uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType()).getFixedValue(); @@ -2131,11 +2207,11 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, ElementSize /= 8; for (const Slice &S : P) - if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL)) + if (!isVectorPromotionViableForSlice(P, S, VTy, ElementSize, DL, VScale)) return false; for (const Slice *S : P.splitSliceTails()) - if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL)) + if (!isVectorPromotionViableForSlice(P, *S, VTy, ElementSize, DL, VScale)) return false; return true; @@ -2150,7 +2226,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, SmallVectorImpl &CandidateTys, bool HaveCommonEltTy, Type *CommonEltTy, bool HaveVecPtrTy, bool HaveCommonVecPtrTy, - VectorType *CommonVecPtrTy) { + VectorType *CommonVecPtrTy, unsigned VScale) { // If we didn't find a vector type, nothing to do here. if (CandidateTys.empty()) return nullptr; @@ -2226,7 +2302,7 @@ checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, }); for (VectorType *VTy : CandidateTys) - if (checkVectorTypeForPromotion(P, VTy, DL)) + if (checkVectorTypeForPromotion(P, VTy, DL, VScale)) return VTy; return nullptr; @@ -2237,7 +2313,7 @@ static VectorType *createAndCheckVectorTypesForPromotion( function_ref CheckCandidateType, Partition &P, const DataLayout &DL, SmallVectorImpl &CandidateTys, bool &HaveCommonEltTy, Type *&CommonEltTy, bool &HaveVecPtrTy, - bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy) { + bool &HaveCommonVecPtrTy, VectorType *&CommonVecPtrTy, unsigned VScale) { [[maybe_unused]] VectorType *OriginalElt = CandidateTysCopy.size() ? 
CandidateTysCopy[0] : nullptr; // Consider additional vector types where the element type size is a @@ -2262,9 +2338,9 @@ static VectorType *createAndCheckVectorTypesForPromotion( } } - return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, - CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy); + return checkVectorTypesForPromotion( + P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, + HaveCommonVecPtrTy, CommonVecPtrTy, VScale); } /// Test whether the given alloca partitioning and range of slices can be @@ -2276,7 +2352,8 @@ static VectorType *createAndCheckVectorTypesForPromotion( /// SSA value. We only can ensure this for a limited set of operations, and we /// don't want to do the rewrites unless we are confident that the result will /// be promotable, so we have an early test here. -static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { +static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL, + unsigned VScale) { // Collect the candidate types for vector-based promotion. Also track whether // we have different element types. SmallVector CandidateTys; @@ -2288,7 +2365,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { bool HaveCommonEltTy = true; bool HaveCommonVecPtrTy = true; auto CheckCandidateType = [&](Type *Ty) { - if (auto *VTy = dyn_cast(Ty)) { + if (auto *VTy = dyn_cast(Ty)) { // Return if bitcast to vectors is different for total size in bits. 
if (!CandidateTys.empty()) { VectorType *V = CandidateTys[0]; @@ -2343,14 +2420,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { if (auto *VTy = createAndCheckVectorTypesForPromotion( LoadStoreTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, - HaveCommonVecPtrTy, CommonVecPtrTy)) + HaveCommonVecPtrTy, CommonVecPtrTy, VScale)) return VTy; CandidateTys.clear(); return createAndCheckVectorTypesForPromotion( DeferredTys, CandidateTysCopy, CheckCandidateType, P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, HaveCommonVecPtrTy, - CommonVecPtrTy); + CommonVecPtrTy, VScale); } /// Test whether a slice of an alloca is valid for integer widening. @@ -2387,7 +2464,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (LI->isVolatile()) return false; // We can't handle loads that extend past the allocated memory. - if (DL.getTypeStoreSize(LI->getType()).getFixedValue() > Size) + TypeSize LoadSize = DL.getTypeStoreSize(LI->getType()); + if (!LoadSize.isFixed() || LoadSize.getFixedValue() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerLoad. @@ -2412,7 +2490,8 @@ static bool isIntegerWideningViableForSlice(const Slice &S, if (SI->isVolatile()) return false; // We can't handle stores that extend past the allocated memory. - if (DL.getTypeStoreSize(ValueTy).getFixedValue() > Size) + TypeSize StoreSize = DL.getTypeStoreSize(ValueTy); + if (!StoreSize.isFixed() || StoreSize.getFixedValue() > Size) return false; // So far, AllocaSliceRewriter does not support widening split slice tails // in rewriteIntegerStore. @@ -2885,8 +2964,6 @@ class AllocaSliceRewriter : public InstVisitor { Type *TargetTy = IsSplit ? 
Type::getIntNTy(LI.getContext(), SliceSize * 8) : LI.getType(); - const bool IsLoadPastEnd = - DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize; bool IsPtrAdjusted = false; Value *V; if (VecTy) { @@ -2896,8 +2973,9 @@ class AllocaSliceRewriter : public InstVisitor { } else if (NewBeginOffset == NewAllocaBeginOffset && NewEndOffset == NewAllocaEndOffset && (canConvertValue(DL, NewAllocaTy, TargetTy) || - (IsLoadPastEnd && NewAllocaTy->isIntegerTy() && - TargetTy->isIntegerTy() && !LI.isVolatile()))) { + (NewAllocaTy->isIntegerTy() && TargetTy->isIntegerTy() && + DL.getTypeStoreSize(TargetTy).getFixedValue() > SliceSize && + !LI.isVolatile()))) { Value *NewPtr = getPtrToNewAI(LI.getPointerAddressSpace(), LI.isVolatile()); LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), NewPtr, @@ -3070,7 +3148,8 @@ class AllocaSliceRewriter : public InstVisitor { if (AllocaInst *AI = dyn_cast(V->stripInBoundsOffsets())) Pass.PostPromotionWorklist.insert(AI); - if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedValue()) { + TypeSize StoreSize = DL.getTypeStoreSize(V->getType()); + if (StoreSize.isFixed() && SliceSize < StoreSize.getFixedValue()) { assert(!SI.isVolatile()); assert(V->getType()->isIntegerTy() && "Only integer type loads and stores are split"); @@ -4846,14 +4925,18 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, Type *SliceTy = nullptr; VectorType *SliceVecTy = nullptr; const DataLayout &DL = AI.getDataLayout(); + unsigned VScale = AI.getFunction()->getVScaleValue(); + std::pair CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()); // Do all uses operate on the same type? 
- if (CommonUseTy.first) - if (DL.getTypeAllocSize(CommonUseTy.first).getFixedValue() >= P.size()) { + if (CommonUseTy.first) { + TypeSize CommonUseSize = DL.getTypeAllocSize(CommonUseTy.first); + if (CommonUseSize.isFixed() && CommonUseSize.getFixedValue() >= P.size()) { SliceTy = CommonUseTy.first; SliceVecTy = dyn_cast(SliceTy); } + } // If not, can we find an appropriate subtype in the original allocated type? if (!SliceTy) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), @@ -4874,12 +4957,12 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, // If the common use types are not viable for promotion then attempt to find // another type that is viable. - if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL)) + if (SliceVecTy && !checkVectorTypeForPromotion(P, SliceVecTy, DL, VScale)) if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(), P.beginOffset(), P.size())) { VectorType *TypePartitionVecTy = dyn_cast(TypePartitionTy); if (TypePartitionVecTy && - checkVectorTypeForPromotion(P, TypePartitionVecTy, DL)) + checkVectorTypeForPromotion(P, TypePartitionVecTy, DL, VScale)) SliceTy = TypePartitionTy; } @@ -4890,7 +4973,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS, bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL); VectorType *VecTy = - IsIntegerPromotable ? nullptr : isVectorPromotionViable(P, DL); + IsIntegerPromotable ? 
nullptr : isVectorPromotionViable(P, DL, VScale); if (VecTy) SliceTy = VecTy; diff --git a/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll new file mode 100644 index 0000000000000..85715e406e065 --- /dev/null +++ b/llvm/test/Transforms/SROA/scalable-vectors-with-known-vscale.ll @@ -0,0 +1,349 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG +; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + +; This test checks that SROA runs mem2reg on scalable vectors. + +define @alloca_nxv16i1( %pg) vscale_range(1) { +; CHECK-LABEL: @alloca_nxv16i1( +; CHECK-NEXT: ret [[PG:%.*]] +; + %pg.addr = alloca + store %pg, ptr %pg.addr + %1 = load , ptr %pg.addr + ret %1 +} + +define @alloca_nxv16i8( %vec) vscale_range(1) { +; CHECK-LABEL: @alloca_nxv16i8( +; CHECK-NEXT: ret [[VEC:%.*]] +; + %vec.addr = alloca + store %vec, ptr %vec.addr + %1 = load , ptr %vec.addr + ret %1 +} + +; Test scalable alloca that can't be promoted. Mem2Reg only considers +; non-volatile loads and stores for promotion. +define @unpromotable_alloca( %vec) vscale_range(1) { +; CHECK-LABEL: @unpromotable_alloca( +; CHECK-NEXT: [[VEC_ADDR:%.*]] = alloca , align 16 +; CHECK-NEXT: store volatile [[VEC:%.*]], ptr [[VEC_ADDR]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load volatile , ptr [[VEC_ADDR]], align 16 +; CHECK-NEXT: ret [[TMP1]] +; + %vec.addr = alloca + store volatile %vec, ptr %vec.addr + %1 = load volatile , ptr %vec.addr + ret %1 +} + +; Test we bail out when using an alloca of a fixed-length vector (VLS) that was +; bitcasted to a scalable vector. 
+define @cast_alloca_to_svint32_t( %type.coerce) vscale_range(1) { +; CHECK-LABEL: @cast_alloca_to_svint32_t( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[TYPE_COERCE:%.*]], i64 0) +; CHECK-NEXT: [[TYPE_0_VEC_EXPAND:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <16 x i32> +; CHECK-NEXT: [[TYPE_0_VECBLEND:%.*]] = select <16 x i1> , <16 x i32> [[TYPE_0_VEC_EXPAND]], <16 x i32> undef +; CHECK-NEXT: [[TYPE_ADDR_0_VEC_EXTRACT:%.*]] = shufflevector <16 x i32> [[TYPE_0_VECBLEND]], <16 x i32> poison, <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[TYPE_ADDR_0_VEC_EXTRACT]], i64 0) +; CHECK-NEXT: ret [[TMP2]] +; + %type = alloca <16 x i32> + %type.addr = alloca <16 x i32> + store %type.coerce, ptr %type + %type1 = load <16 x i32>, ptr %type + store <16 x i32> %type1, ptr %type.addr + %1 = load <16 x i32>, ptr %type.addr + %2 = load , ptr %type.addr + ret %2 +} + +; When casting from VLA to VLS via memory check we bail out when producing a +; GEP where the element type is a scalable vector. +define @cast_alloca_from_svint32_t() vscale_range(1) { +; CHECK-LABEL: @cast_alloca_from_svint32_t( +; CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[RETVAL_COERCE]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 16 +; CHECK-NEXT: ret [[TMP1]] +; + %retval = alloca <16 x i32> + store <16 x i32> zeroinitializer, ptr %retval + %retval.coerce = alloca + call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false) + %1 = load , ptr %retval.coerce + ret %1 +} + +; Test we bail out when using an alloca of a fixed-length vector (VLS) that was +; bitcasted to a scalable vector. 
+define void @select_load_alloca_to_svdouble_t() vscale_range(1) { +; CHECK-LABEL: @select_load_alloca_to_svdouble_t( +; CHECK-NEXT: [[Z:%.*]] = alloca <16 x half>, align 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 0, 0 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null +; CHECK-NEXT: [[VAL:%.*]] = load , ptr [[COND]], align 16 +; CHECK-NEXT: ret void +; + %z = alloca <16 x half> + %cmp = icmp eq i32 0, 0 + %cond = select i1 %cmp, ptr %z, ptr null + %val = load , ptr %cond, align 16 + ret void +} + +define void @select_store_alloca_to_svdouble_t( %val) vscale_range(1) { +; CHECK-LABEL: @select_store_alloca_to_svdouble_t( +; CHECK-NEXT: [[Z:%.*]] = alloca <16 x half>, align 32 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 0, 0 +; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP]], ptr [[Z]], ptr null +; CHECK-NEXT: store [[VAL:%.*]], ptr [[COND]], align 16 +; CHECK-NEXT: ret void +; + %z = alloca <16 x half> + %cmp = icmp eq i32 0, 0 + %cond = select i1 %cmp, ptr %z, ptr null + store %val, ptr %cond, align 16 + ret void +} + +define <4 x i32> @fixed_alloca_fixed_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[A:%.*]], i64 0) +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[A:%.*]] to +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP1]], i64 0) +; CHECK-NEXT: ret <2 x i8> [[TMP2]] +; + %tmp = alloca <2 x i8> + store %a, ptr %tmp + %cast = load <2 x i8>, ptr %tmp + ret <2 x i8> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_inttoptr( %a) vscale_range(1) { +; CHECK-LABEL: 
@fixed_alloca_fixed_from_scalable_inttoptr( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast [[A:%.*]] to +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = inttoptr <2 x i64> [[TMP3]] to <2 x ptr> +; CHECK-NEXT: ret <2 x ptr> [[TMP2]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define <4 x i32> @fixed_alloca_fixed_from_scalable_ptrtoint( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoint( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint [[A:%.*]] to +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = call <4 x i32> @llvm.vector.extract.v4i32.nxv4i32( [[TMP2]], i64 0) +; CHECK-NEXT: ret <4 x i32> [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr( +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.nxv2p0( [[A:%.*]], i64 0) +; CHECK-NEXT: ret <2 x ptr> [[TMP_0_CAST]] +; + %tmp = alloca <2 x ptr> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr_different_addrspace( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr_different_addrspace( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint [[A:%.*]] to +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr <2 x i64> [[TMP2]] to <2 x ptr> +; CHECK-NEXT: ret <2 x ptr> [[TMP3]] +; + %tmp = alloca <2 x ptr> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define @fixed_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) { +; CHECK-LABEL: 
@fixed_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[A:%.*]], i64 0) +; CHECK-NEXT: ret [[TMP1]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv2i8.v2i8( poison, <2 x i8> [[A:%.*]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: ret [[TMP2]] +; + %tmp = alloca <2 x i8> + store <2 x i8> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_inttoptr(<4 x i32> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_inttoptr( +; CHECK-NEXT: [[TMP1:%.*]] = call @llvm.vector.insert.nxv4i32.v4i32( poison, <4 x i32> [[A:%.*]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = bitcast [[TMP1]] to +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = inttoptr [[TMP2]] to +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoint(<2 x ptr> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoint( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = bitcast [[TMP2]] to +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoptr(<2 x ptr> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr( +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = call @llvm.vector.insert.nxv2p0.v2p0( poison, <2 x ptr> [[A:%.*]], i64 0) +; CHECK-NEXT: ret 
[[TMP_0_CAST]] +; + %tmp = alloca <2 x ptr> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoptr_different_addrspace(<2 x ptr> %a) vscale_range(1) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr_different_addrspace( +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint <2 x ptr> [[A:%.*]] to <2 x i64> +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.insert.nxv2i64.v2i64( poison, <2 x i64> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = inttoptr [[TMP2]] to +; CHECK-NEXT: ret [[TMP3]] +; + %tmp = alloca <2 x ptr> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define <4 x i32> @scalable_alloca_fixed_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @scalable_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[CAST]] +; + %tmp = alloca + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define @scalable_alloca_scalable_from_fixed(<4 x i32> %a) vscale_range(1) { +; CHECK-LABEL: @scalable_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[CAST]] +; + %tmp = alloca + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define i16 @scalar_alloca_scalar_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @scalar_alloca_scalar_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2 +; CHECK-NEXT: ret i16 [[TMP_0_CAST]] +; + %tmp = alloca i16 + store %a, ptr %tmp + %cast = load i16, ptr %tmp + ret i16 %cast +} + +define @scalar_alloca_scalable_from_scalar(i16 %a) 
vscale_range(1) { +; CHECK-LABEL: @scalar_alloca_scalable_from_scalar( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store i16 [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 2 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca i16 + store i16 %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable( %a) vscale_range(1) { +; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 8 +; CHECK-NEXT: [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[TMP]], align 8 +; CHECK-NEXT: [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[TMP]], i64 8 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8 +; CHECK-NEXT: [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1 +; CHECK-NEXT: ret { <2 x i32>, <2 x i32> } [[CAST_FCA_1_INSERT]] +; + %tmp = alloca { <2 x i32>, <2 x i32> } + store %a, ptr %tmp + %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp + ret { <2 x i32>, <2 x i32> } %cast +} + +define @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) vscale_range(1) { +; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16 +; CHECK-NEXT: [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1 +; CHECK-NEXT: [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr 
[[TMP]], i64 16 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca { <2 x ptr>, <2 x ptr> } + store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-MODIFY-CFG: {{.*}} +; CHECK-PRESERVE-CFG: {{.*}} diff --git a/llvm/test/Transforms/SROA/scalable-vectors.ll b/llvm/test/Transforms/SROA/scalable-vectors.ll index d892883ce9dc3..346814d9f630e 100644 --- a/llvm/test/Transforms/SROA/scalable-vectors.ll +++ b/llvm/test/Transforms/SROA/scalable-vectors.ll @@ -2,6 +2,8 @@ ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-PRESERVE-CFG ; RUN: opt < %s -passes='sroa' -S | FileCheck %s --check-prefixes=CHECK,CHECK-MODIFY-CFG +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-n8:16:32:64" + ; This test checks that SROA runs mem2reg on scalable vectors. 
define @alloca_nxv16i1( %pg) { @@ -67,11 +69,12 @@ define @cast_alloca_to_svint32_t( %type.coe define @cast_alloca_from_svint32_t() { ; CHECK-LABEL: @cast_alloca_from_svint32_t( ; CHECK-NEXT: [[RETVAL_COERCE:%.*]] = alloca , align 16 -; CHECK-NEXT: store <16 x i32> undef, ptr [[RETVAL_COERCE]], align 16 +; CHECK-NEXT: store <16 x i32> zeroinitializer, ptr [[RETVAL_COERCE]], align 16 ; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[RETVAL_COERCE]], align 16 ; CHECK-NEXT: ret [[TMP1]] ; %retval = alloca <16 x i32> + store <16 x i32> zeroinitializer, ptr %retval %retval.coerce = alloca call void @llvm.memcpy.p0.p0.i64(ptr align 16 %retval.coerce, ptr align 16 %retval, i64 64, i1 false) %1 = load , ptr %retval.coerce @@ -110,6 +113,224 @@ define void @select_store_alloca_to_svdouble_t( %val) { ret void } +define <4 x i32> @fixed_alloca_fixed_from_scalable( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[TMP1]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x i8> @fixed_alloca_fixed_from_scalable_requires_bitcast( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_requires_bitcast( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i8>, align 2 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x i8>, ptr [[TMP]], align 2 +; CHECK-NEXT: ret <2 x i8> [[TMP2]] +; + %tmp = alloca <2 x i8> + store %a, ptr %tmp + %cast = load <2 x i8>, ptr %tmp + ret <2 x i8> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_inttoptr( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_inttoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP2:%.*]] = load <2 x ptr>, ptr 
[[TMP]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[TMP2]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define <4 x i32> @fixed_alloca_fixed_from_scalable_ptrtoint( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoint( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define <2 x ptr> @fixed_alloca_fixed_from_scalable_ptrtoptr( %a) { +; CHECK-LABEL: @fixed_alloca_fixed_from_scalable_ptrtoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x ptr>, align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load <2 x ptr>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <2 x ptr> [[CAST]] +; + %tmp = alloca <2 x ptr> + store %a, ptr %tmp + %cast = load <2 x ptr>, ptr %tmp + ret <2 x ptr> %cast +} + +define @fixed_alloca_scalable_from_fixed(<4 x i32> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP1:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP1]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_requires_bitcast(<2 x i8> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_requires_bitcast( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x i8>, align 2 +; CHECK-NEXT: store <2 x i8> [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP2:%.*]] = load , ptr [[TMP]], align 2 +; CHECK-NEXT: ret [[TMP2]] +; + %tmp = alloca <2 x i8> + store <2 x i8> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define 
@fixed_alloca_scalable_from_fixed_inttoptr(<4 x i32> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_inttoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoint(<2 x ptr> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoint( +; CHECK-NEXT: [[TMP:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <2 x ptr> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca <4 x i32> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define @fixed_alloca_scalable_from_fixed_ptrtoptr(<2 x ptr> %a) { +; CHECK-LABEL: @fixed_alloca_scalable_from_fixed_ptrtoptr( +; CHECK-NEXT: [[TMP:%.*]] = alloca <2 x ptr>, align 16 +; CHECK-NEXT: store <2 x ptr> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[CAST]] +; + %tmp = alloca <2 x ptr> + store <2 x ptr> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define <4 x i32> @scalable_alloca_fixed_from_scalable( %a) { +; CHECK-LABEL: @scalable_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST:%.*]] = load <4 x i32>, ptr [[TMP]], align 16 +; CHECK-NEXT: ret <4 x i32> [[CAST]] +; + %tmp = alloca + store %a, ptr %tmp + %cast = load <4 x i32>, ptr %tmp + ret <4 x i32> %cast +} + +define @scalable_alloca_scalable_from_fixed(<4 x i32> %a) { +; CHECK-LABEL: @scalable_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca , align 16 +; CHECK-NEXT: store <4 x i32> [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: 
[[CAST:%.*]] = load , ptr [[TMP]], align 16 +; CHECK-NEXT: ret [[CAST]] +; + %tmp = alloca + store <4 x i32> %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define i16 @scalar_alloca_scalar_from_scalable( %a) { +; CHECK-LABEL: @scalar_alloca_scalar_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load i16, ptr [[TMP]], align 2 +; CHECK-NEXT: ret i16 [[TMP_0_CAST]] +; + %tmp = alloca i16 + store %a, ptr %tmp + %cast = load i16, ptr %tmp + ret i16 %cast +} + +define @scalar_alloca_scalable_from_scalar(i16 %a) { +; CHECK-LABEL: @scalar_alloca_scalable_from_scalar( +; CHECK-NEXT: [[TMP:%.*]] = alloca i16, align 2 +; CHECK-NEXT: store i16 [[A:%.*]], ptr [[TMP]], align 2 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 2 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca i16 + store i16 %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + +define { <2 x i32>, <2 x i32> } @fixed_struct_alloca_fixed_from_scalable( %a) { +; CHECK-LABEL: @fixed_struct_alloca_fixed_from_scalable( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x i32>, <2 x i32> }, align 8 +; CHECK-NEXT: store [[A:%.*]], ptr [[TMP]], align 16 +; CHECK-NEXT: [[CAST_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 0 +; CHECK-NEXT: [[TMP_0_CAST_FCA_0_LOAD:%.*]] = load <2 x i32>, ptr [[CAST_FCA_0_GEP]], align 8 +; CHECK-NEXT: [[CAST_FCA_0_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } poison, <2 x i32> [[TMP_0_CAST_FCA_0_LOAD]], 0 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x i32>, <2 x i32> }, ptr [[TMP]], i32 0, i32 1 +; CHECK-NEXT: [[TMP_8_CAST_FCA_1_LOAD:%.*]] = load <2 x i32>, ptr [[TMP_8_CAST_FCA_1_GEP_SROA_IDX]], align 8 +; CHECK-NEXT: [[CAST_FCA_1_INSERT:%.*]] = insertvalue { <2 x i32>, <2 x i32> } [[CAST_FCA_0_INSERT]], <2 x i32> [[TMP_8_CAST_FCA_1_LOAD]], 1 +; CHECK-NEXT: ret { <2 x i32>, <2 x i32> 
} [[CAST_FCA_1_INSERT]] +; + %tmp = alloca { <2 x i32>, <2 x i32> } + store %a, ptr %tmp + %cast = load { <2 x i32>, <2 x i32> }, ptr %tmp + ret { <2 x i32>, <2 x i32> } %cast +} + +define @fixed_struct_alloca_scalable_from_fixed({ <2 x ptr>, <2 x ptr> } %a) { +; CHECK-LABEL: @fixed_struct_alloca_scalable_from_fixed( +; CHECK-NEXT: [[TMP:%.*]] = alloca { <2 x ptr>, <2 x ptr> }, align 16 +; CHECK-NEXT: [[A_FCA_0_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A:%.*]], 0 +; CHECK-NEXT: [[A_FCA_0_GEP:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 0 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_0_EXTRACT]], ptr [[A_FCA_0_GEP]], align 16 +; CHECK-NEXT: [[A_FCA_1_EXTRACT:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[A]], 1 +; CHECK-NEXT: [[TMP_16_A_FCA_1_GEP_SROA_IDX:%.*]] = getelementptr inbounds { <2 x ptr>, <2 x ptr> }, ptr [[TMP]], i32 0, i32 1 +; CHECK-NEXT: store <2 x ptr> [[A_FCA_1_EXTRACT]], ptr [[TMP_16_A_FCA_1_GEP_SROA_IDX]], align 16 +; CHECK-NEXT: [[TMP_0_CAST:%.*]] = load , ptr [[TMP]], align 32 +; CHECK-NEXT: ret [[TMP_0_CAST]] +; + %tmp = alloca { <2 x ptr>, <2 x ptr> } + store { <2 x ptr>, <2 x ptr> } %a, ptr %tmp + %cast = load , ptr %tmp + ret %cast +} + declare void @llvm.memcpy.p0.p0.i64(ptr nocapture, ptr nocapture, i64, i1) nounwind ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK-MODIFY-CFG: {{.*}} From ddef9ce8dad611c2fef172f3b08c5c98235a3b41 Mon Sep 17 00:00:00 2001 From: CHANDRA GHALE Date: Wed, 11 Jun 2025 15:39:16 +0530 Subject: [PATCH 053/851] LLVM Buildbot failure on openmp runtime test (#143674) Error looks to be missing includes for complex number support in some system. Removing test for now. 
Relevant PR : [PR-134709](https://github.com/llvm/llvm-project/pull/134709) ``` .---command stderr------------ # | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:78:42: error: use of undeclared identifier 'I' # | 78 | double _Complex expected = 0.0 + 0.0 * I; # | | ^ # | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:79:40: error: use of undeclared identifier 'I' # | 79 | double _Complex result = 0.0 + 0.0 * I; # | | ^ # | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:84:22: error: use of undeclared identifier 'I' # | 84 | arr[i] = i - i * I; # | | ^ # | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:92:19: error: use of undeclared identifier 'creal' # | 92 | real_sum += creal(arr[i]); # | | ^~~~~ # | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:93:19: error: use of undeclared identifier 'cimag' # | 93 | imag_sum += cimag(arr[i]); # | | ^~~~~ # | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:96:36: error: use of undeclared identifier 'I' # | 96 | result = real_sum + imag_sum * I; # | | ^ # | /home/uweigand/sandbox/buildbot/openmp-s390x-linux/llvm.src/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp:97:9: error: use of undeclared identifier 'cabs' # | 97 | if (cabs(result - expected) > 1e-6) { # | | ^~~~ # | 7 errors generated. 
``` Co-authored-by: Chandra Ghale --- .../for/omp_for_private_reduction.cpp | 34 ++----------------- 1 file changed, 3 insertions(+), 31 deletions(-) diff --git a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp index 9bf3be1e9e45d..4520755a8a305 100644 --- a/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp +++ b/openmp/runtime/test/worksharing/for/omp_for_private_reduction.cpp @@ -73,34 +73,6 @@ void performMinMaxRed(int &min_val, int &max_val) { max_val = input_data[i]; } } -int performComplexReduction() { - double _Complex arr[N]; - double _Complex expected = 0.0 + 0.0 * I; - double _Complex result = 0.0 + 0.0 * I; - int error = 0; - - // Initialize the array and compute serial sum - for (int i = 0; i < N; ++i) { - arr[i] = i - i * I; - expected += arr[i]; - } - double real_sum = 0.0, imag_sum = 0.0; -#pragma omp parallel private(real_sum) private(imag_sum) - { -#pragma omp for reduction(+ : real_sum, imag_sum) - for (int i = 0; i < N; ++i) { - real_sum += creal(arr[i]); - imag_sum += cimag(arr[i]); - } - - result = real_sum + imag_sum * I; - if (cabs(result - expected) > 1e-6) { - error++; - } - } - return error; -} - std::complex doComplexReduction(std::complex *arr) { std::complex result(1, 0); @@ -138,7 +110,8 @@ int main(void) { const float kPiVal = 3.14f; const int kExpectedSum = 45; // Sum of 0..9 const int kExpectedProd = 3628800; // 10! 
- const float kExpectedFsum = kPiVal * N; // 3.14f * 10 + const float kExpectedFsum = 31.400000f; // 3.14f * 10 + const float kTolerance = 1e-4f; const int kExpectedMin = 3; const int kExpectedMax = 12; std::complex arr[N]; @@ -163,7 +136,7 @@ int main(void) { total_errors++; if (t_prod_v != kExpectedProd) total_errors++; - if (t_fsum_v != kExpectedFsum) + if (std::abs(t_fsum_v - kExpectedFsum) > kTolerance) total_errors++; } #pragma omp parallel num_threads(4) @@ -177,7 +150,6 @@ int main(void) { total_errors++; } total_errors += checkUserDefinedReduction(); - total_errors += performComplexReduction(); #pragma omp parallel num_threads(4) { std::complex result(1, 0); From 354cfba5209eed5ea6bafb6a3e69e65148c4e25d Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 11 Jun 2025 11:23:24 +0100 Subject: [PATCH 054/851] [DebugInfo][RemoveDIs] Remove scoped-dbg-format-setter (#143450) This was a utility for flipping between intrinsic and debug record mode -- we don't need it any more. The "IsNewDbgInfoFormat" should be true everywhere. 
--- .../include/llvm/IR/DebugProgramInstruction.h | 19 ------------------- llvm/include/llvm/IR/PassManagerImpl.h | 4 ---- llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp | 8 ++------ llvm/lib/CodeGen/MIRPrinter.cpp | 6 ------ llvm/lib/IR/IRPrintingPasses.cpp | 3 --- llvm/lib/IR/LegacyPassManager.cpp | 5 ----- llvm/lib/IRPrinter/IRPrintingPasses.cpp | 3 --- llvm/lib/Linker/IRMover.cpp | 3 --- .../Transforms/IPO/ThinLTOBitcodeWriter.cpp | 4 +--- mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp | 4 ---- 10 files changed, 3 insertions(+), 56 deletions(-) diff --git a/llvm/include/llvm/IR/DebugProgramInstruction.h b/llvm/include/llvm/IR/DebugProgramInstruction.h index 1436677e5a085..e0292c2b8d2d2 100644 --- a/llvm/include/llvm/IR/DebugProgramInstruction.h +++ b/llvm/include/llvm/IR/DebugProgramInstruction.h @@ -692,25 +692,6 @@ getDbgRecordRange(DbgMarker *DebugMarker) { DEFINE_ISA_CONVERSION_FUNCTIONS(DbgRecord, LLVMDbgRecordRef) -/// Used to temporarily set the debug info format of a function, module, or -/// basic block for the duration of this object's lifetime, after which the -/// prior state will be restored. 
-template class ScopedDbgInfoFormatSetter { - T &Obj; - bool OldState; - -public: - ScopedDbgInfoFormatSetter(T &Obj, bool NewState) - : Obj(Obj), OldState(Obj.IsNewDbgInfoFormat) { - Obj.setIsNewDbgInfoFormat(NewState); - } - ~ScopedDbgInfoFormatSetter() { Obj.setIsNewDbgInfoFormat(OldState); } -}; - -template -ScopedDbgInfoFormatSetter(T &Obj, - bool NewState) -> ScopedDbgInfoFormatSetter; - } // namespace llvm #endif // LLVM_IR_DEBUGPROGRAMINSTRUCTION_H diff --git a/llvm/include/llvm/IR/PassManagerImpl.h b/llvm/include/llvm/IR/PassManagerImpl.h index fe7b35fbce2c5..ade13f10c54e4 100644 --- a/llvm/include/llvm/IR/PassManagerImpl.h +++ b/llvm/include/llvm/IR/PassManagerImpl.h @@ -63,10 +63,6 @@ PreservedAnalyses PassManager::run( detail::getAnalysisResult( AM, IR, std::tuple(ExtraArgs...)); - // RemoveDIs: if requested, convert debug-info to DbgRecord representation - // for duration of these passes. - ScopedDbgInfoFormatSetter FormatSetter(IR, true); - StackTraceEntry Entry(PI, IR); for (auto &Pass : Passes) { Entry.setPass(&*Pass); diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp index fb393d33df3b2..e48f735ded831 100644 --- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp +++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp @@ -19,9 +19,7 @@ using namespace llvm; PreservedAnalyses BitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { - ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat); - if (M.IsNewDbgInfoFormat) - M.removeDebugIntrinsicDeclarations(); + M.removeDebugIntrinsicDeclarations(); const ModuleSummaryIndex *Index = EmitSummaryIndex ? 
&(AM.getResult(M)) @@ -51,9 +49,7 @@ namespace { StringRef getPassName() const override { return "Bitcode Writer"; } bool runOnModule(Module &M) override { - ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat); - if (M.IsNewDbgInfoFormat) - M.removeDebugIntrinsicDeclarations(); + M.removeDebugIntrinsicDeclarations(); WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, /*Index=*/nullptr, /*EmitModuleHash=*/false); diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 34ac0794f901f..7710b503facc3 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -965,17 +965,11 @@ void MIRFormatter::printIRValue(raw_ostream &OS, const Value &V, } void llvm::printMIR(raw_ostream &OS, const Module &M) { - ScopedDbgInfoFormatSetter FormatSetter(const_cast(M), true); - yaml::Output Out(OS); Out << const_cast(M); } void llvm::printMIR(raw_ostream &OS, const MachineModuleInfo &MMI, const MachineFunction &MF) { - // RemoveDIs: as there's no textual form for DbgRecords yet, print debug-info - // in dbg.value format. - ScopedDbgInfoFormatSetter FormatSetter( - const_cast(MF.getFunction()), true); printMF(OS, MMI, MF); } diff --git a/llvm/lib/IR/IRPrintingPasses.cpp b/llvm/lib/IR/IRPrintingPasses.cpp index eb35377d0fb23..5c062800198fc 100644 --- a/llvm/lib/IR/IRPrintingPasses.cpp +++ b/llvm/lib/IR/IRPrintingPasses.cpp @@ -40,7 +40,6 @@ class PrintModulePassWrapper : public ModulePass { ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {} bool runOnModule(Module &M) override { - ScopedDbgInfoFormatSetter FormatSetter(M, true); // Remove intrinsic declarations when printing in the new format. // TODO: consider removing this as debug-intrinsics are gone. M.removeDebugIntrinsicDeclarations(); @@ -84,8 +83,6 @@ class PrintFunctionPassWrapper : public FunctionPass { // This pass just prints a banner followed by the function as it's processed. 
bool runOnFunction(Function &F) override { - ScopedDbgInfoFormatSetter FormatSetter(F, true); - if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) OS << Banner << " (function: " << F.getName() << ")\n" diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index fd69e309cdf10..c8f1606ea06cb 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -526,11 +526,6 @@ bool PassManagerImpl::run(Module &M) { dumpArguments(); dumpPasses(); - // RemoveDIs: if a command line flag is given, convert to the - // DbgVariableRecord representation of debug-info for the duration of these - // passes. - ScopedDbgInfoFormatSetter FormatSetter(M, true); - for (ImmutablePass *ImPass : getImmutablePasses()) Changed |= ImPass->doInitialization(M); diff --git a/llvm/lib/IRPrinter/IRPrintingPasses.cpp b/llvm/lib/IRPrinter/IRPrintingPasses.cpp index 5fd6a094fa57b..81ad284ea1642 100644 --- a/llvm/lib/IRPrinter/IRPrintingPasses.cpp +++ b/llvm/lib/IRPrinter/IRPrintingPasses.cpp @@ -32,7 +32,6 @@ PrintModulePass::PrintModulePass(raw_ostream &OS, const std::string &Banner, EmitSummaryIndex(EmitSummaryIndex) {} PreservedAnalyses PrintModulePass::run(Module &M, ModuleAnalysisManager &AM) { - ScopedDbgInfoFormatSetter FormatSetter(M, true); // Remove intrinsic declarations when printing in the new format. // TODO: consider removing this now that debug intrinsics are gone. 
M.removeDebugIntrinsicDeclarations(); @@ -72,8 +71,6 @@ PrintFunctionPass::PrintFunctionPass(raw_ostream &OS, const std::string &Banner) PreservedAnalyses PrintFunctionPass::run(Function &F, FunctionAnalysisManager &) { - ScopedDbgInfoFormatSetter FormatSetter(F, true); - if (isFunctionInPrintList(F.getName())) { if (forcePrintModuleIR()) OS << Banner << " (function: " << F.getName() << ")\n" << *F.getParent(); diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp index 4dd5ae81c89c1..a449185b2b9ba 100644 --- a/llvm/lib/Linker/IRMover.cpp +++ b/llvm/lib/Linker/IRMover.cpp @@ -1444,9 +1444,6 @@ Error IRLinker::run() { if (Error Err = SrcM->getMaterializer()->materializeMetadata()) return Err; - // Convert source module to match dest for the duration of the link. - ScopedDbgInfoFormatSetter FormatSetter(*SrcM, DstM.IsNewDbgInfoFormat); - // Inherit the target data from the source module if the destination // module doesn't have one already. if (DstM.getDataLayout().isDefault()) diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 06f5d78d77e01..e276376f21583 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -584,9 +584,7 @@ llvm::ThinLTOBitcodeWriterPass::run(Module &M, ModuleAnalysisManager &AM) { FunctionAnalysisManager &FAM = AM.getResult(M).getManager(); - ScopedDbgInfoFormatSetter FormatSetter(M, M.IsNewDbgInfoFormat); - if (M.IsNewDbgInfoFormat) - M.removeDebugIntrinsicDeclarations(); + M.removeDebugIntrinsicDeclarations(); bool Changed = writeThinLTOBitcode( OS, ThinLinkOS, diff --git a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp index 73e8626db3a09..75170bffcdf21 100644 --- a/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp +++ b/mlir/lib/Target/LLVMIR/ConvertToLLVMIR.cpp @@ -32,10 +32,6 @@ void registerToLLVMIRTranslation() { if (!llvmModule) return failure(); - // When 
printing LLVM IR, we should convert the module to the debug info - // format that LLVM expects us to print. - // See https://llvm.org/docs/RemoveDIsDebugInfo.html - llvm::ScopedDbgInfoFormatSetter formatSetter(*llvmModule, true); llvmModule->removeDebugIntrinsicDeclarations(); llvmModule->print(output, nullptr); return success(); From 79a72c47d09c2e2cee645430f9d290c20d2618f1 Mon Sep 17 00:00:00 2001 From: AZero13 Date: Wed, 11 Jun 2025 06:29:37 -0400 Subject: [PATCH 055/851] [AArch64] Consider negated powers of 2 when calculating throughput cost (#143013) Negated powers of 2 have similar or (exact in the case of remainder) codegen with lowering sdiv. In the case of sdiv, it just negates the result in the end anyway, so nothing dissimilar at all. --- .../AArch64/AArch64TargetTransformInfo.cpp | 24 ++++--- llvm/test/Analysis/CostModel/AArch64/div.ll | 36 +++++----- llvm/test/Analysis/CostModel/AArch64/rem.ll | 36 +++++----- .../Analysis/CostModel/AArch64/sve-div.ll | 72 +++++++++---------- .../Analysis/CostModel/AArch64/sve-rem.ll | 72 +++++++++---------- 5 files changed, 124 insertions(+), 116 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 6e9a35c462fc9..acd37a5ae0720 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4005,7 +4005,8 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( // have similar cost. auto VT = TLI->getValueType(DL, Ty); if (VT.isScalarInteger() && VT.getSizeInBits() <= 64) { - if (Op2Info.isPowerOf2()) { + if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) { + // Neg can be folded into the asr instruction. return ISD == ISD::SDIV ? 
(3 * AddCost + AsrCost) : (3 * AsrCost + AddCost); } else { @@ -4013,17 +4014,24 @@ InstructionCost AArch64TTIImpl::getArithmeticInstrCost( } } else if (VT.isVector()) { InstructionCost UsraCost = 2 * AsrCost; - if (Op2Info.isPowerOf2()) { + if (Op2Info.isPowerOf2() || Op2Info.isNegatedPowerOf2()) { // Division with scalable types corresponds to native 'asrd' // instruction when SVE is available. // e.g. %1 = sdiv %a, splat (i32 8) + + // One more for the negation in SDIV + InstructionCost Cost = + (Op2Info.isNegatedPowerOf2() && ISD == ISD::SDIV) ? AsrCost : 0; if (Ty->isScalableTy() && ST->hasSVE()) - return 2 * AsrCost; - return UsraCost + - (ISD == ISD::SDIV - ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * - AsrCost - : 2 * AddCost); + Cost += 2 * AsrCost; + else { + Cost += + UsraCost + + (ISD == ISD::SDIV + ? (LT.second.getScalarType() == MVT::i64 ? 1 : 2) * AsrCost + : 2 * AddCost); + } + return Cost; } else if (LT.second == MVT::v2i64) { return VT.getVectorNumElements() * getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind, diff --git a/llvm/test/Analysis/CostModel/AArch64/div.ll b/llvm/test/Analysis/CostModel/AArch64/div.ll index 5367344ce573f..3a2358dba51b2 100644 --- a/llvm/test/Analysis/CostModel/AArch64/div.ll +++ b/llvm/test/Analysis/CostModel/AArch64/div.ll @@ -870,27 +870,27 @@ define void @sdiv_uniformconstnegpow2() { ; CHECK-LABEL: 'sdiv_uniformconstnegpow2' ; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %I128 = sdiv i128 undef, -16 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I64 = sdiv i64 undef, -16 -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16) +; 
CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I32 = sdiv i32 undef, -16 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I16 = sdiv i16 undef, -16 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 
SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I8 = sdiv i8 undef, -16 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 
Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %I128 = sdiv i128 undef, -16 diff --git a/llvm/test/Analysis/CostModel/AArch64/rem.ll b/llvm/test/Analysis/CostModel/AArch64/rem.ll index d684e3af00b83..2fa62f1705911 100644 --- a/llvm/test/Analysis/CostModel/AArch64/rem.ll +++ b/llvm/test/Analysis/CostModel/AArch64/rem.ll @@ -870,27 +870,27 @@ define void @srem_uniformconstnegpow2() { ; CHECK-LABEL: 'srem_uniformconstnegpow2' ; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %I128 = srem i128 undef, -16 ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I64 = srem i64 undef, -16 -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I32 = srem i32 undef, -16 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 
SizeLat:4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I16 = srem i16 undef, -16 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 
for: %V16i16 = srem <16 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16) ; CHECK-NEXT: Cost Model: Found costs of 4 for: %I8 = srem i8 undef, -16 -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 -16) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %I128 = srem i128 undef, -16 diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll index 480c3146a210d..c055d3218f65b 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-div.ll +++ 
b/llvm/test/Analysis/CostModel/AArch64/sve-div.ll @@ -479,42 +479,42 @@ define void @udiv_uniformconstpow2() { define void @sdiv_uniformconstnegpow2() { ; CHECK-LABEL: 'sdiv_uniformconstnegpow2' -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:32 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat 
(i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = sdiv undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = sdiv undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = sdiv undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = sdiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = sdiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = sdiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = sdiv undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = sdiv undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: 
Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = sdiv undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i64 = sdiv <2 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = sdiv <4 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = sdiv <8 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = sdiv <2 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = sdiv <4 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = sdiv <8 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = sdiv <16 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = sdiv <2 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = sdiv <4 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = sdiv <8 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 
CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = sdiv <16 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = sdiv <32 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = sdiv <2 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = sdiv <4 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = sdiv <8 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = sdiv <16 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = sdiv <32 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = sdiv <64 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = sdiv undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = sdiv undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = sdiv undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = sdiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = sdiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = sdiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = sdiv undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 
CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = sdiv undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:3 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = sdiv undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = sdiv undef, splat (i8 -16) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = sdiv <2 x i64> undef, splat (i64 -16) diff --git a/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll b/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll index e2488735de4b5..eac8b66bcd216 100644 --- a/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll +++ b/llvm/test/Analysis/CostModel/AArch64/sve-rem.ll @@ -491,43 +491,43 @@ define void @urem_uniformconstpow2() { define void @srem_uniformconstnegpow2() { ; CHECK-LABEL: 'srem_uniformconstnegpow2' -; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:32 
CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:6 CodeSize:4 Lat:4 SizeLat:4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:12 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:24 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, 
splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i64 = srem <2 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V4i64 = srem <4 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V8i64 = srem <8 x i64> undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i32 = srem <2 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i32 = srem <4 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V8i32 = srem <8 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V16i32 = srem <16 x i32> undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i16 = srem <2 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i16 = srem <4 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i16 = srem <8 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V16i16 = srem <16 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V32i16 = srem <32 x i16> undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V2i8 = srem <2 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V4i8 = srem <4 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V8i8 = srem <8 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %V16i8 = srem <16 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %V32i8 = srem <32 x i8> undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:16 CodeSize:4 Lat:4 SizeLat:4 for: %V64i8 = srem <64 x i8> undef, splat (i8 
-16) ; CHECK-NEXT: Cost Model: Found costs of RThru:Invalid CodeSize:4 Lat:4 SizeLat:4 for: %NV2i128 = srem undef, splat (i128 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = srem undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i64 = srem undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = srem undef, splat (i64 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = srem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = srem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i32 = srem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = srem undef, splat (i32 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = srem undef, splat (i16 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:5 CodeSize:4 
Lat:4 SizeLat:4 for: %NV16i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:10 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i8 = srem undef, splat (i8 -16) -; CHECK-NEXT: Cost Model: Found costs of RThru:20 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i64 = srem undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV4i64 = srem undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i64 = srem undef, splat (i64 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i32 = srem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i32 = srem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV8i32 = srem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i32 = srem undef, splat (i32 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV16i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV32i16 = srem undef, splat (i16 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV2i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV4i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV8i8 = srem undef, splat (i8 -16) +; 
CHECK-NEXT: Cost Model: Found costs of RThru:2 CodeSize:4 Lat:4 SizeLat:4 for: %NV16i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of 4 for: %NV32i8 = srem undef, splat (i8 -16) +; CHECK-NEXT: Cost Model: Found costs of RThru:8 CodeSize:4 Lat:4 SizeLat:4 for: %NV64i8 = srem undef, splat (i8 -16) ; CHECK-NEXT: Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret void ; %V2i64 = srem <2 x i64> undef, splat (i64 -16) From 40cc7b4578fd2d65aaef8356fbe7caf2d84a8f3e Mon Sep 17 00:00:00 2001 From: Tomas Matheson Date: Wed, 11 Jun 2025 11:45:23 +0100 Subject: [PATCH 056/851] [clang][AArch64] test -cc1 -print-enabled-extensions (#143570) This adds tests that document how -cc1 and -print-enabled-extensions interact. The current behaviour looks wrong, and is caused by the fact that --print-enabled-extensions uses the MC subtarget feature API to determine the list of extensions to print, whereas the frontend uses the TargetParser API. The latter does no dependency expansion for the -target-feature flags but the MC API does. This doesn't fix anything but at least it documents the current behaviour, and will serve as a pre-commit test for any future fixes. --- .../aarch64-print-enabled-extensions-cc1.c | 139 ++++++++++++++++++ 1 file changed, 139 insertions(+) create mode 100644 clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c diff --git a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c new file mode 100644 index 0000000000000..5d65fdafaa251 --- /dev/null +++ b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c @@ -0,0 +1,139 @@ +// Test how -cc1 -target-feature interacts with -print-enabled-extensions. 
+// The current behaviour does not look correct, since dependent features are +// removed from the printed list when one of their dependencies is disabled, +// but they are actually still enabled during compilation, and then actually +// disabled for parsing assembly. + +// REQUIRES: aarch64-registered-target + +// Behaviour with two positive features. +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \ +// RUN: -target-feature +neon -target-feature +sve \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=POS_ONLY + +// Negative -target-feature disables the extension but keeps any dependencies of it (FEAT_FP16). +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \ +// RUN: -target-feature +neon -target-feature +sve -target-feature -sve \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=POS_NEG + +// Disabling then re-enabling a feature is the same as never disabling it. +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \ +// RUN: -target-feature +neon -target-feature -sve -target-feature +sve \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=POS_ONLY + +// Disabling then re-enabling a feature is the same as never disabling it. +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \ +// RUN: -target-feature +neon -target-feature +sve -target-feature -sve -target-feature +sve \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=POS_ONLY + +// Only disabling it is the same as never having enabled it. +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \ +// RUN: -target-feature +neon \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=NEG_ONLY + +// Only disabling it is the same as never having enabled it. 
+// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \ +// RUN: -target-feature +neon -target-feature -sve \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=NEG_ONLY + +// Disabling a dependency (after enabling the dependent) appears to disable the dependent feature. +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \ +// RUN: -target-feature +sve2 -target-feature -sve \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_DEP + +// Disabling a dependency before enabling the dependent appears to have no effect. +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \ +// RUN: -target-feature -sve -target-feature +sve2 \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_DEP2 + +// Disabling a dependency before enabling the dependent appears to have no effect. +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -print-enabled-extensions \ +// RUN: -target-feature +sve2 \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_DEP2 + +// Driver --print-enabled-extensions indicates that negative -target-features disable dependent features. +// RUN: %clang --target=aarch64 -march=armv8-a+sve2 --print-enabled-extensions \ +// RUN: -Xclang -target-feature -Xclang -sve \ +// RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_VIA_XCLANG + +// However, sve2 is actually enabled in clang but disabled for MC. 
+// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s \ +// RUN: -Xclang -target-feature -Xclang -sve \ +// RUN: -Xclang -verify -Xclang -verify-ignore-unexpected=note + + +// POS_ONLY: Extensions enabled for the given AArch64 target +// POS_ONLY-EMPTY: +// POS_ONLY-NEXT: Architecture Feature(s) Description +// POS_ONLY-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// POS_ONLY-NEXT: FEAT_ETE Enable Embedded Trace Extension +// POS_ONLY-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// POS_ONLY-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// POS_ONLY-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions +// POS_ONLY-NEXT: FEAT_TRBE Enable Trace Buffer Extension + +// POS_NEG: Extensions enabled for the given AArch64 target +// POS_NEG-EMPTY: +// POS_NEG-NEXT: Architecture Feature(s) Description +// POS_NEG-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// POS_NEG-NEXT: FEAT_ETE Enable Embedded Trace Extension +// POS_NEG-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// POS_NEG-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// POS_NEG-NEXT: FEAT_TRBE Enable Trace Buffer Extension + +// NEG_POS: Extensions enabled for the given AArch64 target +// NEG_POS-EMPTY: +// NEG_POS-NEXT: Architecture Feature(s) Description +// NEG_POS-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// NEG_POS-NEXT: FEAT_ETE Enable Embedded Trace Extension +// NEG_POS-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// NEG_POS-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// NEG_POS-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions +// NEG_POS-NEXT: FEAT_TRBE Enable Trace Buffer Extension + +// NEG_ONLY: Extensions enabled for the given AArch64 target +// NEG_ONLY-EMPTY: +// NEG_ONLY-NEXT: Architecture Feature(s) Description +// NEG_ONLY-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// NEG_ONLY-NEXT: FEAT_ETE Enable Embedded 
Trace Extension +// NEG_ONLY-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// NEG_ONLY-NEXT: FEAT_TRBE Enable Trace Buffer Extension + +// DISABLE_DEP: Extensions enabled for the given AArch64 target +// DISABLE_DEP-EMPTY: +// DISABLE_DEP-NEXT: Architecture Feature(s) Description +// DISABLE_DEP-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// DISABLE_DEP-NEXT: FEAT_ETE Enable Embedded Trace Extension +// DISABLE_DEP-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// DISABLE_DEP-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// DISABLE_DEP-NEXT: FEAT_TRBE Enable Trace Buffer Extension + +// DISABLE_DEP2: Extensions enabled for the given AArch64 target +// DISABLE_DEP2-EMPTY: +// DISABLE_DEP2-NEXT: Architecture Feature(s) Description +// DISABLE_DEP2-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// DISABLE_DEP2-NEXT: FEAT_ETE Enable Embedded Trace Extension +// DISABLE_DEP2-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// DISABLE_DEP2-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// DISABLE_DEP2-NEXT: FEAT_SVE Enable Scalable Vector Extension (SVE) instructions +// DISABLE_DEP2-NEXT: FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions +// DISABLE_DEP2-NEXT: FEAT_TRBE Enable Trace Buffer Extension + +// DISABLE_VIA_XCLANG: Extensions enabled for the given AArch64 target +// DISABLE_VIA_XCLANG-EMPTY: +// DISABLE_VIA_XCLANG-NEXT: Architecture Feature(s) Description +// DISABLE_VIA_XCLANG-NEXT: FEAT_AdvSIMD Enable Advanced SIMD instructions +// DISABLE_VIA_XCLANG-NEXT: FEAT_ETE Enable Embedded Trace Extension +// DISABLE_VIA_XCLANG-NEXT: FEAT_FP Enable Armv8.0-A Floating Point Extensions +// DISABLE_VIA_XCLANG-NEXT: FEAT_FP16 Enable half-precision floating-point data processing +// DISABLE_VIA_XCLANG-NEXT: FEAT_TRBE Enable Trace Buffer Extension + +#if __ARM_FEATURE_SVE2 +#warning "SVE2 is enabled" +// expected-warning@-1 {{SVE2 is enabled}} +#endif + +void 
fn_that_requires_sve2() { + __asm__("ldnt1sh z0.s, p0/z, [z1.s]"); + // expected-error@-1 {{instruction requires: sve2}} +} From 19b0e1227ca6653405e4a34627d04a14f2287f26 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 11 Jun 2025 13:27:14 +0200 Subject: [PATCH 057/851] [ConstantFolding] Fold sqrt poison -> poison (#141821) I noticed this when a sqrt produced by VectorCombine with a poison operand wasn't getting folded away to poison. Most intrinsics in general could probably be folded to poison if one of their arguments are poison too. Are there any exceptions to this we need to be aware of? --- llvm/lib/Analysis/ConstantFolding.cpp | 7 ++- .../InstSimplify/fp-undef-poison.ll | 50 +++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 23ea6966fbf6c..1ef0badd23757 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -2223,8 +2223,13 @@ static Constant *ConstantFoldScalarCall1(StringRef Name, if (isa(Operands[0])) { // TODO: All of these operations should probably propagate poison. 
- if (IntrinsicID == Intrinsic::canonicalize) + switch (IntrinsicID) { + case Intrinsic::canonicalize: + case Intrinsic::sqrt: return PoisonValue::get(Ty); + default: + break; + } } if (isa(Operands[0])) { diff --git a/llvm/test/Transforms/InstSimplify/fp-undef-poison.ll b/llvm/test/Transforms/InstSimplify/fp-undef-poison.ll index cb2026df962c8..ffab9c94ddf42 100644 --- a/llvm/test/Transforms/InstSimplify/fp-undef-poison.ll +++ b/llvm/test/Transforms/InstSimplify/fp-undef-poison.ll @@ -293,3 +293,53 @@ define double @fmul_nnan_inf_op1(double %x) { %r = fmul nnan double %x, 0xfff0000000000000 ret double %r } + +define float @sqrt_poison() { +; CHECK-LABEL: @sqrt_poison( +; CHECK-NEXT: ret float poison +; + %sqrt = call float @llvm.sqrt(float poison) + ret float %sqrt +} + +define <2 x float> @sqrt_poison_fixed_vec() { +; CHECK-LABEL: @sqrt_poison_fixed_vec( +; CHECK-NEXT: ret <2 x float> poison +; + %sqrt = call <2 x float> @llvm.sqrt(<2 x float> poison) + ret <2 x float> %sqrt +} + +define <2 x float> @sqrt_poison_elt_fixed_vec() { +; CHECK-LABEL: @sqrt_poison_elt_fixed_vec( +; CHECK-NEXT: ret <2 x float> +; + %sqrt = call <2 x float> @llvm.sqrt(<2 x float> ) + ret <2 x float> %sqrt +} + +define @sqrt_poison_scalable_vec() { +; CHECK-LABEL: @sqrt_poison_scalable_vec( +; CHECK-NEXT: ret poison +; + %sqrt = call @llvm.sqrt( poison) + ret %sqrt +} + +define float @sqrt_nnan_nan() { +; CHECK-LABEL: @sqrt_nnan_nan( +; CHECK-NEXT: [[SQRT:%.*]] = call nnan float @llvm.sqrt.f32(float 0x7FF8000000000000) +; CHECK-NEXT: ret float [[SQRT]] +; + %sqrt = call nnan float @llvm.sqrt(float 0x7ff8000000000000) + ret float %sqrt +} + +define float @sqrt_ninf_inf() { +; CHECK-LABEL: @sqrt_ninf_inf( +; CHECK-NEXT: [[SQRT:%.*]] = call ninf float @llvm.sqrt.f32(float 0xFFF0000000000000) +; CHECK-NEXT: ret float [[SQRT]] +; + %sqrt = call ninf float @llvm.sqrt(float 0xfff0000000000000) + ret float %sqrt +} From 44a7ecd1d7485be94d3a92021c650175f100d2f7 Mon Sep 17 00:00:00 2001 From: 
Alexander Ziaee Date: Wed, 11 Jun 2025 07:27:23 -0400 Subject: [PATCH 058/851] [doc] Use ISO nomenclature for 1024 byte units (#133148) Increase specificity by using the correct unit sizes. KBytes is an abbreviation for kB, 1000 bytes, and the hardware industry as well as several operating systems have now switched to using 1000 byte kBs. If this change is acceptable, sometimes GitHub mangles merges to use the original email of the account. $dayjob asks contributions have my work email. Thanks! --- lld/ELF/Relocations.cpp | 2 +- .../Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp | 2 +- lldb/tools/debugserver/source/RNBRemote.cpp | 2 +- llvm/lib/Target/X86/X86TargetTransformInfo.cpp | 4 ++-- openmp/tools/archer/ompt-tsan.cpp | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 8413d8bb2437c..1af01e7247dce 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -161,7 +161,7 @@ static RelType getMipsPairType(RelType type, bool isLocal) { // symbol, the R_MIPS_GOT16 relocation creates a GOT entry to hold // the high 16 bits of the symbol's value. A paired R_MIPS_LO16 // relocations handle low 16 bits of the address. That allows - // to allocate only one GOT entry for every 64 KBytes of local data. + // to allocate only one GOT entry for every 64 KiB of local data. return isLocal ? R_MIPS_LO16 : R_MIPS_NONE; case R_MICROMIPS_GOT16: return isLocal ? 
R_MICROMIPS_LO16 : R_MIPS_NONE; diff --git a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp index 67ba42f33d1dd..4a1117222f34c 100644 --- a/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/GDBRemoteCommunicationServerCommon.cpp @@ -1377,7 +1377,7 @@ GDBRemoteCommunicationServerCommon::GetModuleInfo(llvm::StringRef module_path, std::vector GDBRemoteCommunicationServerCommon::HandleFeatures( const llvm::ArrayRef client_features) { - // 128KBytes is a reasonable max packet size--debugger can always use less. + // 128 KiB is a reasonable max packet size--debugger can always use less. constexpr uint32_t max_packet_size = 128 * 1024; // Features common to platform server and llgs. diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp index af3c66c71c77e..391d1c50168ea 100644 --- a/lldb/tools/debugserver/source/RNBRemote.cpp +++ b/lldb/tools/debugserver/source/RNBRemote.cpp @@ -3476,7 +3476,7 @@ static bool GetProcessNameFrom_vAttach(const char *&p, } rnb_err_t RNBRemote::HandlePacket_qSupported(const char *p) { - uint32_t max_packet_size = 128 * 1024; // 128KBytes is a reasonable max packet + uint32_t max_packet_size = 128 * 1024; // 128 KiB is a reasonable max packet // size--debugger can always use less std::stringstream reply; reply << "qXfer:features:read+;PacketSize=" << std::hex << max_packet_size diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp index 33c9edd24646b..a1a177528eb23 100644 --- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp +++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp @@ -123,7 +123,7 @@ std::optional X86TTIImpl::getCacheSize( // - Broadwell // - Skylake // - Kabylake - return 32 * 1024; // 32 KByte + return 32 * 1024; // 32 KiB case 
TargetTransformInfo::CacheLevel::L2D: // - Penryn // - Nehalem @@ -134,7 +134,7 @@ std::optional X86TTIImpl::getCacheSize( // - Broadwell // - Skylake // - Kabylake - return 256 * 1024; // 256 KByte + return 256 * 1024; // 256 KiB } llvm_unreachable("Unknown TargetTransformInfo::CacheLevel"); diff --git a/openmp/tools/archer/ompt-tsan.cpp b/openmp/tools/archer/ompt-tsan.cpp index bb60fc6b603f4..c315999af4328 100644 --- a/openmp/tools/archer/ompt-tsan.cpp +++ b/openmp/tools/archer/ompt-tsan.cpp @@ -1224,7 +1224,7 @@ static void ompt_tsan_finalize(ompt_data_t *tool_data) { if (archer_flags->print_max_rss) { struct rusage end; getrusage(RUSAGE_SELF, &end); - printf("MAX RSS[KBytes] during execution: %ld\n", end.ru_maxrss); + printf("MAX RSS[KiB] during execution: %ld\n", end.ru_maxrss); } if (archer_flags) From abbbe4a6cd1b83b89a834163335053863f5ffbfa Mon Sep 17 00:00:00 2001 From: Simone Pellegrini Date: Wed, 11 Jun 2025 13:37:34 +0200 Subject: [PATCH 059/851] [mlir][vector] Fix attaching write effects on transfer_write's base (#142940) This fixes an issue with `TransferWriteOp`'s implementation of the `MemoryEffectOpInterface` where the write effect was attached to the stored value rather than the base. This had the effect that when asking for the memory effects for the input memref buffer using `getEffectsOnValue(...)`, the function would return no-effects (as the effect would have been attached to the stored value rather than the input buffer). 
--- flang/test/HLFIR/assign-side-effects.fir | 9 +- flang/test/HLFIR/memory-effects.fir | 93 ++++++++++--------- mlir/lib/Dialect/Vector/IR/VectorOps.cpp | 2 +- .../Dialect/Bufferization/side-effects.mlir | 6 +- mlir/test/Dialect/Vector/side-effects.mlir | 15 +++ mlir/test/IR/test-side-effects.mlir | 8 +- mlir/test/lib/IR/TestSideEffects.cpp | 12 +-- 7 files changed, 81 insertions(+), 64 deletions(-) create mode 100644 mlir/test/Dialect/Vector/side-effects.mlir diff --git a/flang/test/HLFIR/assign-side-effects.fir b/flang/test/HLFIR/assign-side-effects.fir index dfd1c5886e4fa..cac9530e2277c 100644 --- a/flang/test/HLFIR/assign-side-effects.fir +++ b/flang/test/HLFIR/assign-side-effects.fir @@ -2,14 +2,14 @@ // RUN: fir-opt %s --test-side-effects --verify-diagnostics func.func @test1(%x: !fir.ref, %i: i32) { - // expected-remark @below {{found an instance of 'write' on a op operand, on resource ''}} + // expected-remark @below {{found an instance of 'write' on op operand 1, on resource ''}} hlfir.assign %i to %x : i32, !fir.ref return } func.func @test2(%x: !fir.ref, %y: !fir.ref) { - // expected-remark @below {{found an instance of 'write' on a op operand, on resource ''}} - // expected-remark @below {{found an instance of 'read' on a op operand, on resource ''}} + // expected-remark @below {{found an instance of 'write' on op operand 1, on resource ''}} + // expected-remark @below {{found an instance of 'read' on op operand 0, on resource ''}} hlfir.assign %y to %x : !fir.ref, !fir.ref return } @@ -22,7 +22,8 @@ func.func @test3(%x: !fir.ref>, %y: !fir.ref>) { } func.func @test4(%x: !fir.ref>>>, %y: !fir.box>) { - // expected-remark @below {{found an instance of 'read' on a op operand, on resource ''}} + // expected-remark @below {{found an instance of 'read' on op operand 0, on resource ''}} + // expected-remark @below {{found an instance of 'read' on op operand 1, on resource ''}} // expected-remark @below {{found an instance of 'write' on resource ''}} // 
expected-remark @below {{found an instance of 'free' on resource ''}} // expected-remark @below {{found an instance of 'allocate' on resource ''}} diff --git a/flang/test/HLFIR/memory-effects.fir b/flang/test/HLFIR/memory-effects.fir index cac887ebe67de..6c791f1260be7 100644 --- a/flang/test/HLFIR/memory-effects.fir +++ b/flang/test/HLFIR/memory-effects.fir @@ -3,8 +3,9 @@ func.func @concat(%arg0: !fir.ref>, %arg1: !fir.ref>) { // expected-remark@+1 {{operation has no memory effects}} %c30 = arith.constant 30 : index -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+3 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource ''}} %0 = hlfir.concat %arg0, %arg1 len %c30 : (!fir.ref>, !fir.ref>, index) -> (!hlfir.expr>) return } @@ -16,8 +17,8 @@ func.func @all_no_effects(%arg0: !hlfir.expr<2x!fir.logical<4>>) { } func.func @all_effects(%arg0: !fir.ref>>, %arg1: i32) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %all = hlfir.all %arg0 dim %arg1 : (!fir.ref>>, i32) -> !hlfir.expr> return } @@ -29,8 +30,8 @@ func.func @any_no_effects(%arg0: !hlfir.expr<2x!fir.logical<4>>) { } func.func @any_effects(%arg0: !fir.ref>>, %arg1: i32) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an 
instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %all = hlfir.any %arg0 dim %arg1 : (!fir.ref>>, i32) -> !hlfir.expr> return } @@ -42,7 +43,7 @@ func.func @count_no_effects(%arg0: !hlfir.expr<2x!fir.logical<4>>) { } func.func @count_effects(%arg0: !fir.ref>>, %arg1: i32) { -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %all = hlfir.count %arg0 dim %arg1 : (!fir.ref>>, i32) -> i32 return } @@ -54,15 +55,15 @@ func.func @product_no_effects(%arg0: !hlfir.expr) { } func.func @product_effects(%arg0: !fir.ref>, %arg1: i32) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %product = hlfir.product %arg0 dim %arg1 : (!fir.ref>, i32) -> !hlfir.expr<2xf32> return } func.func @set_length_read(%arg0: !fir.ref>, %arg1: index) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %0 = hlfir.set_length %arg0 len %arg1 : (!fir.ref>, index) -> !hlfir.expr> return } @@ -74,8 +75,8 @@ func.func @sum_no_effects(%arg0: !hlfir.expr) { } func.func @sum_effects(%arg0: !fir.ref>, %arg1: i32) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// 
expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %sum = hlfir.sum %arg0 dim %arg1 : (!fir.ref>, i32) -> !hlfir.expr<2xf32> return } @@ -87,8 +88,8 @@ func.func @maxval_no_effects(%arg0: !hlfir.expr) { } func.func @maxval_effects(%arg0: !fir.ref>, %arg1: i32) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %maxval = hlfir.maxval %arg0 dim %arg1 : (!fir.ref>, i32) -> !hlfir.expr<2xf32> return } @@ -100,34 +101,34 @@ func.func @minval_no_effects(%arg0: !hlfir.expr) { } func.func @minval_effects(%arg0: !fir.ref>, %arg1: i32) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %minval = hlfir.minval %arg0 dim %arg1 : (!fir.ref>, i32) -> !hlfir.expr<2xf32> return } func.func @minloc_effects_simple(%arg0: !hlfir.expr) { -// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource ''}} +// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource ''}} %minloc = hlfir.minloc %arg0 : (!hlfir.expr) -> !hlfir.expr return } func.func @minloc_effects(%arg0: !fir.ref>, %arg1: i32) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'allocate' 
on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %minloc = hlfir.minloc %arg0 dim %arg1 : (!fir.ref>, i32) -> !hlfir.expr<2xi32> return } func.func @maxloc_effects_simple(%arg0: !hlfir.expr) { -// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource ''}} +// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource ''}} %maxloc = hlfir.maxloc %arg0 : (!hlfir.expr) -> !hlfir.expr return } func.func @maxloc_effects(%arg0: !fir.ref>, %arg1: i32) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %maxloc = hlfir.maxloc %arg0 dim %arg1 : (!fir.ref>, i32) -> !hlfir.expr<2xi32> return } @@ -139,49 +140,49 @@ func.func @dot_product_no_effects(%arg0: !hlfir.expr, %arg1: !hlfir.expr< } func.func @dot_product_effects(%arg0: !fir.ref>, %arg1: !fir.ref>) { -// there are read effects on both arguments - the diagnostic verification just doesn't register duplicate identical diagnostics -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource ''}} %0 = hlfir.dot_product %arg0 %arg1 : (!fir.ref>, !fir.ref>) -> f32 return } func.func @matmul_no_reads(%arg0: !hlfir.expr, %arg1: !hlfir.expr) { -// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource ''}} +// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource ''}} %0 = hlfir.matmul %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr return } func.func 
@matmul_reads(%arg0: !fir.ref>, %arg1: !fir.ref>) { -// expected-remark@+3 {{found an instance of 'allocate' on a op result, on resource ''}} -// there are read effects on both arguments - the diagnostic verification just doesn't register duplicate identical diagnostics -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+3 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource ''}} %0 = hlfir.matmul %arg0 %arg1 : (!fir.ref>, !fir.ref>) -> !hlfir.expr<10x10xf32> return } func.func @transpose_no_reads(%arg0: !hlfir.expr) { -// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource ''}} +// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource ''}} %0 = hlfir.transpose %arg0 : (!hlfir.expr) -> !hlfir.expr return } func.func @transpose_read(%arg0: !fir.ref>) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %0 = hlfir.transpose %arg0 : (!fir.ref>) -> !hlfir.expr<5x10xf32> return } func.func @matmul_transpose_no_reads(%arg0: !hlfir.expr, %arg1: !hlfir.expr) { -// expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource ''}} +// expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource ''}} %0 = hlfir.matmul_transpose %arg0 %arg1 : (!hlfir.expr, !hlfir.expr) -> !hlfir.expr return } func.func @matmul_transpose_reads(%arg0: !fir.ref>, %arg1: !fir.ref>) { -// expected-remark@+3 {{found an instance of 'allocate' on a op result, on resource ''}} 
-// there are read effects on both arguments - the diagnostic verification just doesn't register duplicate identical diagnostics -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+3 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource ''}} %0 = hlfir.matmul_transpose %arg0 %arg1 : (!fir.ref>, !fir.ref>) -> !hlfir.expr<10x10xf32> return } @@ -195,8 +196,8 @@ func.func @associate(%arg0: i32) { } func.func @as_expr_read(%arg0: !fir.ref>) { -// expected-remark@+2 {{found an instance of 'allocate' on a op result, on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} %0 = hlfir.as_expr %arg0 : (!fir.ref>) -> !hlfir.expr // expected-remark@+1 {{found an instance of 'free' on resource ''}} hlfir.destroy %0 : !hlfir.expr @@ -204,28 +205,28 @@ func.func @as_expr_read(%arg0: !fir.ref>) { } func.func @char_extremum(%arg0: !fir.ref>, %arg1: !fir.ref>) { -// expected-remark@+3 {{found an instance of 'allocate' on a op result, on resource ''}} -// there are read effects on both arguments - the diagnostic verification just doesn't register duplicate identical diagnostics -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+3 {{found an instance of 'allocate' on op result 0, on resource ''}} +// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 1, on resource ''}} %0 = hlfir.char_extremum min, %arg0, %arg1 : (!fir.ref>, !fir.ref>) -> !hlfir.expr> return } func.func 
@copy_in(%box: !fir.box>, %temp: !fir.ref>>>, %is_present: i1) { // expected-remark@+3 {{found an instance of 'allocate' on resource ''}} -// expected-remark@+2 {{found an instance of 'read' on a op operand, on resource ''}} -// expected-remark@+1 {{found an instance of 'write' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'write' on op operand 1, on resource ''}} %0:2 = hlfir.copy_in %box to %temp : (!fir.box>, !fir.ref>>>) -> (!fir.box>, i1) return } func.func @copy_out(%box: !fir.box>, %temp: !fir.ref>>>, %was_copied: i1) { // expected-remark@+2 {{found an instance of 'free' on resource ''}} -// expected-remark@+1 {{found an instance of 'read' on a op operand, on resource ''}} +// expected-remark@+1 {{found an instance of 'read' on op operand 0, on resource ''}} hlfir.copy_out %temp, %was_copied : (!fir.ref>>>, i1) -> () // expected-remark@+3 {{found an instance of 'free' on resource ''}} -// expected-remark@+2 {{found an instance of 'read' on a op operand, on resource ''}} -// expected-remark@+1 {{found an instance of 'write' on a op operand, on resource ''}} +// expected-remark@+2 {{found an instance of 'read' on op operand 0, on resource ''}} +// expected-remark@+1 {{found an instance of 'write' on op operand 2, on resource ''}} hlfir.copy_out %temp, %was_copied to %box : (!fir.ref>>>, i1, !fir.box>) -> () return } diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp index 3179b4f975404..a295bf1eb4d95 100644 --- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp +++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp @@ -5038,7 +5038,7 @@ void TransferWriteOp::getEffects( SmallVectorImpl> &effects) { if (llvm::isa(getShapedType())) - effects.emplace_back(MemoryEffects::Write::get(), &getValueToStoreMutable(), + effects.emplace_back(MemoryEffects::Write::get(), &getBaseMutable(), 
SideEffects::DefaultResource::get()); } diff --git a/mlir/test/Dialect/Bufferization/side-effects.mlir b/mlir/test/Dialect/Bufferization/side-effects.mlir index 841490e9f3234..129fc8b32c270 100644 --- a/mlir/test/Dialect/Bufferization/side-effects.mlir +++ b/mlir/test/Dialect/Bufferization/side-effects.mlir @@ -1,9 +1,9 @@ // RUN: mlir-opt %s --test-side-effects --verify-diagnostics func.func @test_side_effects(%arg0: memref<2xi32>) -> memref<2xi32> { - // expected-remark @below {{found an instance of 'read' on a op operand, on resource ''}} - // expected-remark @below {{found an instance of 'write' on a op result, on resource ''}} - // expected-remark @below {{found an instance of 'allocate' on a op result, on resource ''}} + // expected-remark @below {{found an instance of 'read' on op operand 0, on resource ''}} + // expected-remark @below {{found an instance of 'write' on op result 0, on resource ''}} + // expected-remark @below {{found an instance of 'allocate' on op result 0, on resource ''}} %0 = bufferization.clone %arg0 : memref<2xi32> to memref<2xi32> return %0 : memref<2xi32> } diff --git a/mlir/test/Dialect/Vector/side-effects.mlir b/mlir/test/Dialect/Vector/side-effects.mlir new file mode 100644 index 0000000000000..54c274a1a2a02 --- /dev/null +++ b/mlir/test/Dialect/Vector/side-effects.mlir @@ -0,0 +1,15 @@ +// RUN: mlir-opt %s --test-side-effects --verify-diagnostics + +func.func @test_side_effects(%arg0: memref<8xf32>) { + // expected-remark @below {{operation has no memory effects}} + %c0 = arith.constant 0 : index + // expected-remark @below {{operation has no memory effects}} + %c4 = arith.constant 4 : index + // expected-remark @below {{operation has no memory effects}} + %cst = arith.constant 0.0 : f32 + // expected-remark @below {{found an instance of 'read' on op operand 0, on resource ''}} + %0 = vector.transfer_read %arg0[%c0], %cst : memref<8xf32>, vector<4xf32> + // expected-remark @below {{found an instance of 'write' on op operand 1, on 
resource ''}} + vector.transfer_write %0, %arg0[%c4] : vector<4xf32>, memref<8xf32> + return +} diff --git a/mlir/test/IR/test-side-effects.mlir b/mlir/test/IR/test-side-effects.mlir index efce4856041a1..b652ecb7dad1d 100644 --- a/mlir/test/IR/test-side-effects.mlir +++ b/mlir/test/IR/test-side-effects.mlir @@ -15,7 +15,7 @@ func.func @side_effect(%arg : index) { {effect="write", test_resource} ]} : () -> i32 - // expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource ''}} + // expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource ''}} %3 = "test.side_effect_op"() {effects = [ {effect="allocate", on_result, test_resource} ]} : () -> i32 @@ -38,19 +38,19 @@ func.func @side_effect(%arg : index) { effect_parameter = affine_map<(i, j) -> (j, i)> } : () -> i32 - // expected-remark@+1 {{found an instance of 'allocate' on a op operand, on resource ''}} + // expected-remark@+1 {{found an instance of 'allocate' on op operand 0, on resource ''}} %6 = test.side_effect_with_region_op (%arg) { ^bb0(%arg0 : index): test.region_yield %arg0 : index } {effects = [ {effect="allocate", on_operand, test_resource} ]} : index -> index - // expected-remark@+1 {{found an instance of 'allocate' on a op result, on resource ''}} + // expected-remark@+1 {{found an instance of 'allocate' on op result 0, on resource ''}} %7 = test.side_effect_with_region_op (%arg) { ^bb0(%arg0 : index): test.region_yield %arg0 : index } {effects = [ {effect="allocate", on_result, test_resource} ]} : index -> index - // expected-remark@+1 {{found an instance of 'allocate' on a block argument, on resource ''}} + // expected-remark@+1 {{found an instance of 'allocate' on block argument 0, on resource ''}} %8 = test.side_effect_with_region_op (%arg) { ^bb0(%arg0 : index): test.region_yield %arg0 : index diff --git a/mlir/test/lib/IR/TestSideEffects.cpp b/mlir/test/lib/IR/TestSideEffects.cpp index 7e01509d55685..000e7c204fd5f 100644 --- 
a/mlir/test/lib/IR/TestSideEffects.cpp +++ b/mlir/test/lib/IR/TestSideEffects.cpp @@ -52,12 +52,12 @@ struct SideEffectsPass diag << "'write'"; if (instance.getValue()) { - if (instance.getEffectValue()) - diag << " on a op operand,"; - else if (instance.getEffectValue()) - diag << " on a op result,"; - else if (instance.getEffectValue()) - diag << " on a block argument,"; + if (auto *opOpd = instance.getEffectValue()) + diag << " on op operand " << opOpd->getOperandNumber() << ","; + else if (auto opRes = instance.getEffectValue()) + diag << " on op result " << opRes.getResultNumber() << ","; + else if (auto opBlk = instance.getEffectValue()) + diag << " on block argument " << opBlk.getArgNumber() << ","; } else if (SymbolRefAttr symbolRef = instance.getSymbolRef()) diag << " on a symbol '" << symbolRef << "',"; From 2dd88c405d77b34dc028af09f3d55fa10dbed50e Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Wed, 11 Jun 2025 13:44:01 +0200 Subject: [PATCH 060/851] [flang][OpenMP] Extend locality spec to OMP clauses (`init` and `dealloc` regions) (#142795) Extends support for locality specifier to OpenMP translation by adding support for translating localizers that have `init` and `dealloc` regions. 
--- .../OpenMP/DoConcurrentConversion.cpp | 29 +++++++++-- .../locality_specifiers_init_dealloc.mlir | 51 +++++++++++++++++++ 2 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir diff --git a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp index 283c3052c166c..28f6c8bf02813 100644 --- a/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp +++ b/flang/lib/Optimizer/OpenMP/DoConcurrentConversion.cpp @@ -326,16 +326,37 @@ class DoConcurrentConversion TODO(localizer.getLoc(), "local_init conversion is not supported yet"); - if (!localizer.getInitRegion().empty()) - TODO(localizer.getLoc(), - "non-empty `init` regions are not supported yet"); - auto oldIP = rewriter.saveInsertionPoint(); rewriter.setInsertionPointAfter(localizer); auto privatizer = rewriter.create( localizer.getLoc(), sym.getLeafReference().str() + ".omp", localizer.getTypeAttr().getValue(), mlir::omp::DataSharingClauseType::Private); + + if (!localizer.getInitRegion().empty()) { + rewriter.cloneRegionBefore(localizer.getInitRegion(), + privatizer.getInitRegion(), + privatizer.getInitRegion().begin()); + auto firYield = mlir::cast( + privatizer.getInitRegion().back().getTerminator()); + rewriter.setInsertionPoint(firYield); + rewriter.create(firYield.getLoc(), + firYield.getOperands()); + rewriter.eraseOp(firYield); + } + + if (!localizer.getDeallocRegion().empty()) { + rewriter.cloneRegionBefore(localizer.getDeallocRegion(), + privatizer.getDeallocRegion(), + privatizer.getDeallocRegion().begin()); + auto firYield = mlir::cast( + privatizer.getDeallocRegion().back().getTerminator()); + rewriter.setInsertionPoint(firYield); + rewriter.create(firYield.getLoc(), + firYield.getOperands()); + rewriter.eraseOp(firYield); + } + rewriter.restoreInsertionPoint(oldIP); wsloopClauseOps.privateVars.push_back(op); diff --git 
a/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir b/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir new file mode 100644 index 0000000000000..1659c7bdf6d3e --- /dev/null +++ b/flang/test/Transforms/DoConcurrent/locality_specifiers_init_dealloc.mlir @@ -0,0 +1,51 @@ +// Tests mapping `local` locality specifier to `private` clauses for non-empty +// `init` and `dealloc` regions. + +// RUN: fir-opt --omp-do-concurrent-conversion="map-to=host" %s | FileCheck %s + +func.func @my_allocator(%arg0: !fir.ref>>, %arg1: !fir.ref>>) { + return +} + +func.func @my_deallocator(%arg0: !fir.ref>>) { + return +} + +fir.local {type = local} @_QFlocal_assocEaa_private_box_10xf32 : !fir.box> init { +^bb0(%arg0: !fir.ref>>, %arg1: !fir.ref>>): + fir.call @my_allocator(%arg0, %arg1) : (!fir.ref>>, !fir.ref>>) -> () + fir.yield(%arg1 : !fir.ref>>) +} dealloc { +^bb0(%arg0: !fir.ref>>): + fir.call @my_deallocator(%arg0) : (!fir.ref>>) -> () + fir.yield +} + +func.func @_QPlocal_assoc() { + %0 = fir.alloca !fir.box> + %c1 = arith.constant 1 : index + + fir.do_concurrent { + %9 = fir.alloca i32 {bindc_name = "i"} + %10:2 = hlfir.declare %9 {uniq_name = "_QFlocal_assocEi"} : (!fir.ref) -> (!fir.ref, !fir.ref) + fir.do_concurrent.loop (%arg0) = (%c1) to (%c1) step (%c1) local(@_QFlocal_assocEaa_private_box_10xf32 %0 -> %arg1 : !fir.ref>>) { + %11 = fir.convert %arg0 : (index) -> i32 + fir.store %11 to %10#0 : !fir.ref + } + } + + return +} + +// CHECK: omp.private {type = private} @[[PRIVATIZER:.*]] : !fir.box> init { +// CHECK-NEXT: ^bb0(%[[ORIG_ARG:.*]]: !{{.*}}, %[[PRIV_ARG:.*]]: !{{.*}}): +// CHECK-NEXT: fir.call @my_allocator(%[[ORIG_ARG]], %[[PRIV_ARG]]) : ({{.*}}) -> () +// CHECK-NEXT: omp.yield(%[[PRIV_ARG]] : {{.*}}) +// CHECK-NEXT: } dealloc { +// CHECK-NEXT: ^bb0(%[[PRIV_ARG:.*]]: !{{.*}}): +// CHECK-NEXT: fir.call @my_deallocator(%[[PRIV_ARG]]) : ({{.*}}) -> () +// CHECK-NEXT: omp.yield +// CHECK-NEXT: } + +// CHECK: 
%[[LOCAL_ALLOC:.*]] = fir.alloca !fir.box> +// CHECK: omp.wsloop private(@[[PRIVATIZER]] %[[LOCAL_ALLOC]] -> %{{.*}} : !{{.*}}) From 756e7cfd86c7f2bf20aaa1a3f87b5aa72ec128b4 Mon Sep 17 00:00:00 2001 From: Adrian Vogelsgesang Date: Wed, 11 Jun 2025 13:50:32 +0200 Subject: [PATCH 061/851] [debuginfo][coro] Fix linkage name for clones of coro functions (#141889) So far, the `DW_AT_linkage_name` of the coroutine `resume`, `destroy`, `cleanup` and `noalloc` function clones was incorrectly set to the original function name instead of the updated function names. With this commit, we now update the `DW_AT_linkage_name` to the correct name. This has multiple benefits: 1. It's easier for me (and other toolchain developers) to understand the output of `llvm-dwarfdump` when coroutines are involved. 2. When hitting a breakpoint, both LLDB and GDB now tell you which clone of the function you are in. E.g., GDB now prints "Breakpoint 1.2, coro_func(int) [clone .resume] (v=43) at ..." instead of "Breakpoint 1.2, coro_func(int) (v=43) at ...". 3. GDB's `info line coro_func` command now allows you to distinguish the multiple different clones of the function. In Swift, the linkage names of the clones were already updated. The comment right above the relevant code in `CoroSplit.cpp` already hinted that the linkage name should probably also be updated in C++. This comment was added in commit 6ce76ff7eb7640, and back then the corresponding `DW_AT_specification` (i.e., `SP->getDeclaration()`) was not yet updated, which led to problems for C++. In the meantime, commit ca1a5b37c7236d added code to also update `SP->getDeclaration`, as such there is no reason anymore to not update the linkage name for C++. Note that most test cases used inconsistent function names for the LLVM function vs. the DISubprogram linkage name. clang would never emit such LLVM IR. This confused me initially, and hence I fixed it while updating the test case. 
Drive-by fix: The change in `CGVTables.cpp` is purely stylistic, NFC. When looking for other usages of `replaceWithDistinct`, I got initially confused because `CGVTables.cpp` was calling a static function via an object instance. --- clang/lib/CodeGen/CGVTables.cpp | 2 +- .../coroutine_handle/TestCoroutineHandle.py | 5 ++- llvm/lib/Transforms/Coroutines/CoroSplit.cpp | 31 +++++-------------- ...coro-debug-dbg.values-not_used_in_frame.ll | 6 ++-- .../Coroutines/coro-debug-dbg.values.ll | 10 +++--- .../Coroutines/coro-debug-frame-variable.ll | 10 +++--- llvm/test/Transforms/Coroutines/coro-debug.ll | 18 +++++------ 7 files changed, 35 insertions(+), 47 deletions(-) diff --git a/clang/lib/CodeGen/CGVTables.cpp b/clang/lib/CodeGen/CGVTables.cpp index c7447273a42fa..2897ccdf88660 100644 --- a/clang/lib/CodeGen/CGVTables.cpp +++ b/clang/lib/CodeGen/CGVTables.cpp @@ -124,7 +124,7 @@ static void resolveTopLevelMetadata(llvm::Function *Fn, auto *DIS = Fn->getSubprogram(); if (!DIS) return; - auto *NewDIS = DIS->replaceWithDistinct(DIS->clone()); + auto *NewDIS = llvm::MDNode::replaceWithDistinct(DIS->clone()); VMap.MD()[DIS].reset(NewDIS); // Find all llvm.dbg.declare intrinsics and resolve the DILocalVariable nodes diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py index ae1a0c86b45d8..f471ea728f953 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py @@ -18,8 +18,11 @@ def do_test(self, stdlib_type): self.build(dictionary={stdlib_type: "1"}) is_clang = self.expectedCompiler(["clang"]) + # Clang <= 20 used to also name the resume/destroy functions + # as `my_generator_func`. 
+ # Newer versions of clang name the clones as `.resume`/`.destroy`. test_generator_func_ptr_re = re.compile( - r"^\(a.out`my_generator_func\(\) at main.cpp:[0-9]*\)$" + r"^\(a.out`my_generator_func\(\)( \(\..*\))? at main.cpp:[0-9]*\)$" ) # Run until the initial suspension point diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp index cebe44581b061..8813f91e9060c 100644 --- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp @@ -913,29 +913,14 @@ void coro::BaseCloner::create() { assert(SP != OrigF.getSubprogram() && SP->isDistinct()); updateScopeLine(ActiveSuspend, *SP); - // Update the linkage name to reflect the modified symbol name. It - // is necessary to update the linkage name in Swift, since the - // mangling changes for resume functions. It might also be the - // right thing to do in C++, but due to a limitation in LLVM's - // AsmPrinter we can only do this if the function doesn't have an - // abstract specification, since the DWARF backend expects the - // abstract specification to contain the linkage name and asserts - // that they are identical. - if (SP->getUnit() && - SP->getUnit()->getSourceLanguage() == dwarf::DW_LANG_Swift) { - SP->replaceLinkageName(MDString::get(Context, NewF->getName())); - if (auto *Decl = SP->getDeclaration()) { - auto *NewDecl = DISubprogram::get( - Decl->getContext(), Decl->getScope(), Decl->getName(), - NewF->getName(), Decl->getFile(), Decl->getLine(), Decl->getType(), - Decl->getScopeLine(), Decl->getContainingType(), - Decl->getVirtualIndex(), Decl->getThisAdjustment(), - Decl->getFlags(), Decl->getSPFlags(), Decl->getUnit(), - Decl->getTemplateParams(), nullptr, Decl->getRetainedNodes(), - Decl->getThrownTypes(), Decl->getAnnotations(), - Decl->getTargetFuncName()); - SP->replaceDeclaration(NewDecl); - } + // Update the linkage name and the function name to reflect the modified + // name. 
+ MDString *NewLinkageName = MDString::get(Context, NewF->getName()); + SP->replaceLinkageName(NewLinkageName); + if (DISubprogram *Decl = SP->getDeclaration()) { + TempDISubprogram NewDecl = Decl->clone(); + NewDecl->replaceLinkageName(NewLinkageName); + SP->replaceDeclaration(MDNode::replaceWithUniqued(std::move(NewDecl))); } } diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll index 4da07c91eb486..deaec7b8d7f89 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values-not_used_in_frame.ll @@ -2,18 +2,18 @@ ; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split,coro-split)' -S | FileCheck %s ; ; This file is based on coro-debug-frame-variable.ll. -; CHECK: define internal fastcc void @f.resume(ptr noundef nonnull align 16 dereferenceable(80) %begin) !dbg ![[RESUME_FN_DBG_NUM:[0-9]+]] +; CHECK: define internal fastcc void @_Z3foov.resume(ptr noundef nonnull align 16 dereferenceable(80) %begin) !dbg ![[RESUME_FN_DBG_NUM:[0-9]+]] ; CHECK: await.ready: ; CHECK: #dbg_value(i32 poison, ![[IVAR_RESUME:[0-9]+]], !DIExpression( ; CHECK: #dbg_value(i32 poison, ![[JVAR_RESUME:[0-9]+]], !DIExpression( ; -; CHECK: ![[RESUME_FN_DBG_NUM]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov" +; CHECK: ![[RESUME_FN_DBG_NUM]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov.resume" ; CHECK: ![[IVAR_RESUME]] = !DILocalVariable(name: "i" ; CHECK: ![[JVAR_RESUME]] = !DILocalVariable(name: "j" source_filename = "../llvm/test/Transforms/Coroutines/coro-debug-dbg.values-O2.ll" -define void @f(i32 %i, i32 %j) presplitcoroutine !dbg !8 { +define void @_Z3foov(i32 %i, i32 %j) presplitcoroutine !dbg !8 { entry: %__promise = alloca i8, align 8 %x = alloca [10 x i32], align 16 diff --git a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll 
b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll index 28592cc671062..5f7701c357ec3 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-dbg.values.ll @@ -2,7 +2,7 @@ ; RUN: opt < %s -passes='module(coro-early),cgscc(coro-split,coro-split)' -S | FileCheck %s ; ; This file is based on coro-debug-frame-variable.ll. -; CHECK-LABEL: define void @f( +; CHECK-LABEL: define void @_Z3foov( ; CHECK: %[[frame:.*]] = call {{.*}} @llvm.coro.begin ; CHECK: #dbg_value(ptr %[[frame]] ; CHECK-SAME: !DIExpression(DW_OP_plus_uconst, [[OffsetX:[0-9]*]]), @@ -20,7 +20,7 @@ ; CHECK: #dbg_value(ptr %[[frame]] ; CHECK-SAME: !DIExpression(DW_OP_plus_uconst, [[OffsetJ:[0-9]*]], DW_OP_deref), -; CHECK-LABEL: void @f.resume( +; CHECK-LABEL: void @_Z3foov.resume( ; CHECK-SAME: ptr {{.*}} %[[frame:.*]]) ; CHECK-SAME: !dbg ![[RESUME_FN_DBG_NUM:[0-9]+]] ; CHECK: %[[frame_alloca:.*]] = alloca ptr @@ -37,7 +37,7 @@ ; CHECK: #dbg_value(ptr %[[frame_alloca]], ![[JVAR_RESUME:[0-9]+]], ; CHECK-SAME: !DIExpression(DW_OP_deref, DW_OP_plus_uconst, [[OffsetJ]], DW_OP_deref) ; -; CHECK: ![[RESUME_FN_DBG_NUM]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov" +; CHECK: ![[RESUME_FN_DBG_NUM]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov.resume" ; CHECK: ![[FRAME_DI_NUM]] = !DILocalVariable(name: "__coro_frame" ; CHECK: ![[IVAR_RESUME]] = !DILocalVariable(name: "i" ; CHECK: ![[XVAR_RESUME]] = !DILocalVariable(name: "x" @@ -46,7 +46,7 @@ declare void @consume(i32) -define void @f(i32 %i, i32 %j) presplitcoroutine !dbg !8 { +define void @_Z3foov(i32 %i, i32 %j) presplitcoroutine !dbg !8 { entry: %__promise = alloca i8, align 8 %x = alloca [10 x i32], align 16 @@ -257,4 +257,4 @@ attributes #4 = { argmemonly nofree nosync nounwind willreturn writeonly } !21 = !DILocation(line: 43, column: 3, scope: !7) !22 = !DILocation(line: 43, column: 8, scope: !7) !23 = !DILocalVariable(name: "produced", scope: !7, file: !1, 
line:24, type: !10) -!30 = distinct !DIAssignID() \ No newline at end of file +!30 = distinct !DIAssignID() diff --git a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll index a3c62b2dd12e1..125ec752c8345 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug-frame-variable.ll @@ -23,11 +23,11 @@ ; ; The CHECKs verify that dbg.declare intrinsics are created for the coroutine ; funclet 'f.resume', and that they reference the address of the variables on -; the coroutine frame. The debug locations for the original function 'f' are +; the coroutine frame. The debug locations for the original function 'foo' are ; static (!11 and !13), whereas the coroutine funclet will have its own new ; ones with identical line and column numbers. ; -; CHECK-LABEL: define void @f() {{.*}} { +; CHECK-LABEL: define void @_Z3foov() {{.*}} { ; CHECK: entry: ; CHECK: %j = alloca i32, align 4 ; CHECK: #dbg_declare(ptr %j, ![[JVAR:[0-9]+]], !DIExpression(), ![[JDBGLOC:[0-9]+]] @@ -36,7 +36,7 @@ ; CHECK: #dbg_declare(ptr %[[MEMORY]], ![[IVAR:[0-9]+]], !DIExpression(DW_OP_plus_uconst, 20), ![[IDBGLOC]] ; CHECK: await.ready: ; -; CHECK-LABEL: define internal fastcc void @f.resume({{.*}}) {{.*}} { +; CHECK-LABEL: define internal fastcc void @_Z3foov.resume({{.*}}) {{.*}} { ; CHECK: entry.resume: ; CHECK-NEXT: %[[DBG_PTR:.*]] = alloca ptr ; CHECK-NEXT: #dbg_declare(ptr %[[DBG_PTR]], ![[XVAR_RESUME:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, 32), @@ -58,13 +58,13 @@ ; CHECK-DAG: ![[JDBGLOC]] = !DILocation(line: 32, column: 7, scope: ![[BLK_SCOPE]]) ; CHECK-DAG: ![[XVAR_RESUME]] = !DILocalVariable(name: "x" -; CHECK-DAG: ![[RESUME_PROG_SCOPE:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov" +; CHECK-DAG: ![[RESUME_PROG_SCOPE:[0-9]+]] = distinct !DISubprogram(name: "foo", linkageName: "_Z3foov.resume" ; CHECK-DAG: 
![[IDBGLOC_RESUME]] = !DILocation(line: 24, column: 7, scope: ![[RESUME_BLK_SCOPE:[0-9]+]]) ; CHECK-DAG: ![[RESUME_BLK_SCOPE]] = distinct !DILexicalBlock(scope: ![[RESUME_PROG_SCOPE]], file: !1, line: 23, column: 12) ; CHECK-DAG: ![[IVAR_RESUME]] = !DILocalVariable(name: "i" ; CHECK-DAG: ![[JVAR_RESUME]] = !DILocalVariable(name: "j" ; CHECK-DAG: ![[JDBGLOC_RESUME]] = !DILocation(line: 32, column: 7, scope: ![[RESUME_BLK_SCOPE]]) -define void @f() presplitcoroutine !dbg !8 { +define void @_Z3foov() presplitcoroutine !dbg !8 { entry: %__promise = alloca i8, align 8 %i = alloca i32, align 4 diff --git a/llvm/test/Transforms/Coroutines/coro-debug.ll b/llvm/test/Transforms/Coroutines/coro-debug.ll index 17a0b80c5b5e5..a220073248ba3 100644 --- a/llvm/test/Transforms/Coroutines/coro-debug.ll +++ b/llvm/test/Transforms/Coroutines/coro-debug.ll @@ -6,12 +6,12 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" ; Function Attrs: noinline nounwind -define ptr @f(i32 %x) #0 personality i32 0 !dbg !6 { +define ptr @flink(i32 %x) #0 personality i32 0 !dbg !6 { entry: %x.addr = alloca i32, align 4 %coro_hdl = alloca ptr, align 8 store i32 %x, ptr %x.addr, align 4 - %0 = call token @llvm.coro.id(i32 0, ptr null, ptr @f, ptr null), !dbg !16 + %0 = call token @llvm.coro.id(i32 0, ptr null, ptr @flink, ptr null), !dbg !16 %1 = call i64 @llvm.coro.size.i64(), !dbg !16 %call = call ptr @malloc(i64 %1), !dbg !16 %2 = call ptr @llvm.coro.begin(token %0, ptr %call) #7, !dbg !16 @@ -170,8 +170,8 @@ attributes #7 = { noduplicate } !31 = !DILocalVariable(name: "allocated", scope: !6, file: !7, line: 55, type: !11) !32 = !DILocalVariable(name: "inline_asm", scope: !6, file: !7, line: 55, type: !11) -; CHECK: define ptr @f(i32 %x) #0 personality i32 0 !dbg ![[ORIG:[0-9]+]] -; CHECK: define internal fastcc void @f.resume(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[RESUME:[0-9]+]] +; CHECK: define ptr 
@flink(i32 %x) #0 personality i32 0 !dbg ![[ORIG:[0-9]+]] +; CHECK: define internal fastcc void @flink.resume(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[RESUME:[0-9]+]] ; CHECK: entry.resume: ; CHECK: %[[DBG_PTR:.*]] = alloca ptr ; CHECK: #dbg_declare(ptr %[[DBG_PTR]], ![[RESUME_COROHDL:[0-9]+]], !DIExpression(DW_OP_deref, DW_OP_plus_uconst, @@ -194,18 +194,18 @@ attributes #7 = { noduplicate } ; CHECK: [[DEFAULT_DEST]]: ; CHECK-NOT: {{.*}}: ; CHECK: #dbg_value(i32 %[[CALLBR_RES]] -; CHECK: define internal fastcc void @f.destroy(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[DESTROY:[0-9]+]] -; CHECK: define internal fastcc void @f.cleanup(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[CLEANUP:[0-9]+]] +; CHECK: define internal fastcc void @flink.destroy(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[DESTROY:[0-9]+]] +; CHECK: define internal fastcc void @flink.cleanup(ptr noundef nonnull align 8 dereferenceable(40) %0) #0 personality i32 0 !dbg ![[CLEANUP:[0-9]+]] ; CHECK: ![[ORIG]] = distinct !DISubprogram(name: "f", linkageName: "flink" -; CHECK: ![[RESUME]] = distinct !DISubprogram(name: "f", linkageName: "flink" +; CHECK: ![[RESUME]] = distinct !DISubprogram(name: "f", linkageName: "flink.resume" ; CHECK: ![[RESUME_COROHDL]] = !DILocalVariable(name: "coro_hdl", scope: ![[RESUME]] ; CHECK: ![[RESUME_X]] = !DILocalVariable(name: "x", arg: 1, scope: ![[RESUME]] ; CHECK: ![[RESUME_CONST]] = !DILocalVariable(name: "direct_const", scope: ![[RESUME]] ; CHECK: ![[RESUME_DIRECT]] = !DILocalVariable(name: "direct_mem", scope: ![[RESUME]] ; CHECK: ![[RESUME_DIRECT_VALUE]] = !DILocalVariable(name: "direct_value", scope: ![[RESUME]] -; CHECK: ![[DESTROY]] = distinct !DISubprogram(name: "f", linkageName: "flink" +; CHECK: ![[DESTROY]] = distinct !DISubprogram(name: "f", linkageName: "flink.destroy" -; CHECK: ![[CLEANUP]] = distinct 
!DISubprogram(name: "f", linkageName: "flink" +; CHECK: ![[CLEANUP]] = distinct !DISubprogram(name: "f", linkageName: "flink.cleanup" From f44f411afa914107d0a2395d2d8db826f88205e5 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 11 Jun 2025 21:05:42 +0900 Subject: [PATCH 062/851] MSP430: Add tests for fcmp (#142706) The existing coverage is thin. libcalls.ll seems to be the main fcmp test, and it doesn't cover all the condition types, and runs with -O0. Test all conditions for f32 and f64 --- llvm/test/CodeGen/MSP430/fcmp.ll | 761 +++++++++++++++++++++++++++++++ 1 file changed, 761 insertions(+) create mode 100644 llvm/test/CodeGen/MSP430/fcmp.ll diff --git a/llvm/test/CodeGen/MSP430/fcmp.ll b/llvm/test/CodeGen/MSP430/fcmp.ll new file mode 100644 index 0000000000000..df1edc61b3370 --- /dev/null +++ b/llvm/test/CodeGen/MSP430/fcmp.ll @@ -0,0 +1,761 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=msp430-unknown-unknown < %s | FileCheck %s + +define i1 @fcmp_false_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_false_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: clr.b r12 +; CHECK-NEXT: ret + %cmp = fcmp false double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_oeq_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_oeq_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 +; CHECK-NEXT: mov 12(r1), r14 +; CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: and #1, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp oeq double %a, %b + ret i1 %cmp +} + +define i1 
@fcmp_ogt_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_ogt_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 +; CHECK-NEXT: mov 12(r1), r14 +; CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: cmp #1, r13 +; CHECK-NEXT: jge .LBB2_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB2_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp ogt double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_oge_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_oge_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 +; CHECK-NEXT: mov 12(r1), r14 +; CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: tst r13 +; CHECK-NEXT: jge .LBB3_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp oge double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_olt_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_olt_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 +; CHECK-NEXT: mov 12(r1), r14 +; 
CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: tst r13 +; CHECK-NEXT: jl .LBB4_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB4_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp olt double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ole_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_ole_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 +; CHECK-NEXT: mov 12(r1), r14 +; CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: cmp #1, r13 +; CHECK-NEXT: jl .LBB5_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB5_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp ole double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_one_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_one_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r4 +; CHECK-NEXT: push r5 +; CHECK-NEXT: push r6 +; CHECK-NEXT: push r7 +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: sub #8, r1 +; CHECK-NEXT: mov r15, r7 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 24(r1), r12 +; CHECK-NEXT: mov 26(r1), r5 +; CHECK-NEXT: mov 28(r1), r4 +; CHECK-NEXT: mov 30(r1), r6 +; CHECK-NEXT: mov r7, r11 +; CHECK-NEXT: mov r5, r13 +; CHECK-NEXT: mov r4, r14 +; CHECK-NEXT: mov r6, r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r6, 6(r1) +; CHECK-NEXT: mov r4, 4(r1) +; CHECK-NEXT: mov r5, 2(r1) +; 
CHECK-NEXT: mov 24(r1), r13 +; CHECK-NEXT: mov r13, 0(r1) +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mov r9, r13 +; CHECK-NEXT: mov r10, r14 +; CHECK-NEXT: mov r7, r15 +; CHECK-NEXT: call #__unorddf2 +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: rra r6 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: bic r6, r12 +; CHECK-NEXT: and #1, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: add #8, r1 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: pop r7 +; CHECK-NEXT: pop r6 +; CHECK-NEXT: pop r5 +; CHECK-NEXT: pop r4 +; CHECK-NEXT: ret + %cmp = fcmp one double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ord_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_ord_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub #8, r1 +; CHECK-NEXT: mov 16(r1), 6(r1) +; CHECK-NEXT: mov 14(r1), 4(r1) +; CHECK-NEXT: mov 12(r1), 2(r1) +; CHECK-NEXT: mov 10(r1), 0(r1) +; CHECK-NEXT: call #__unorddf2 +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: and #1, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: add #8, r1 +; CHECK-NEXT: ret + %cmp = fcmp ord double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_uno_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_uno_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub #8, r1 +; CHECK-NEXT: mov 16(r1), 6(r1) +; CHECK-NEXT: mov 14(r1), 4(r1) +; CHECK-NEXT: mov 12(r1), 2(r1) +; CHECK-NEXT: mov 10(r1), 0(r1) +; CHECK-NEXT: call #__unorddf2 +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: bic r13, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: add #8, r1 +; CHECK-NEXT: ret + %cmp = fcmp uno double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ueq_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_ueq_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r4 +; CHECK-NEXT: push r5 +; CHECK-NEXT: push r6 +; 
CHECK-NEXT: push r7 +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: sub #8, r1 +; CHECK-NEXT: mov r15, r7 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 24(r1), r12 +; CHECK-NEXT: mov 26(r1), r5 +; CHECK-NEXT: mov 28(r1), r4 +; CHECK-NEXT: mov 30(r1), r6 +; CHECK-NEXT: mov r7, r11 +; CHECK-NEXT: mov r5, r13 +; CHECK-NEXT: mov r4, r14 +; CHECK-NEXT: mov r6, r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r6, 6(r1) +; CHECK-NEXT: mov r4, 4(r1) +; CHECK-NEXT: mov r5, 2(r1) +; CHECK-NEXT: mov 24(r1), r13 +; CHECK-NEXT: mov r13, 0(r1) +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: rra r6 +; CHECK-NEXT: and #1, r6 +; CHECK-NEXT: mov r8, r12 +; CHECK-NEXT: mov r9, r13 +; CHECK-NEXT: mov r10, r14 +; CHECK-NEXT: mov r7, r15 +; CHECK-NEXT: call #__unorddf2 +; CHECK-NEXT: bis r6, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: add #8, r1 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: pop r7 +; CHECK-NEXT: pop r6 +; CHECK-NEXT: pop r5 +; CHECK-NEXT: pop r4 +; CHECK-NEXT: ret + %cmp = fcmp ueq double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ugt_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_ugt_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 +; CHECK-NEXT: mov 12(r1), r14 +; CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: cmp #1, r13 +; CHECK-NEXT: jge .LBB10_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB10_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp ugt 
double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_uge_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_uge_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 +; CHECK-NEXT: mov 12(r1), r14 +; CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: tst r13 +; CHECK-NEXT: jge .LBB11_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB11_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp uge double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ult_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_ult_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 +; CHECK-NEXT: mov 12(r1), r14 +; CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: tst r13 +; CHECK-NEXT: jl .LBB12_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB12_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp ult double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ule_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_ule_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 
+; CHECK-NEXT: mov 12(r1), r14 +; CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: cmp #1, r13 +; CHECK-NEXT: jl .LBB13_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB13_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp ule double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_une_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_une_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r11 +; CHECK-NEXT: mov r14, r10 +; CHECK-NEXT: mov r13, r9 +; CHECK-NEXT: mov r12, r8 +; CHECK-NEXT: mov 8(r1), r12 +; CHECK-NEXT: mov 10(r1), r13 +; CHECK-NEXT: mov 12(r1), r14 +; CHECK-NEXT: mov 14(r1), r15 +; CHECK-NEXT: call #__mspabi_cmpd +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: bic r13, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: ret + %cmp = fcmp une double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_true_f64(double %a, double %b) #0 { +; CHECK-LABEL: fcmp_true_f64: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.b #1, r12 +; CHECK-NEXT: ret + %cmp = fcmp true double %a, %b + ret i1 %cmp +} + +define i1 @fcmp_false_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_false_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: clr.b r12 +; CHECK-NEXT: ret + %cmp = fcmp false float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_oeq_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_oeq_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: and #1, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp oeq float %a, %b + ret i1 %cmp +} + 
+define i1 @fcmp_ogt_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_ogt_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: cmp #1, r13 +; CHECK-NEXT: jge .LBB18_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB18_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp ogt float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_oge_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_oge_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: tst r13 +; CHECK-NEXT: jge .LBB19_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB19_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp oge float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_olt_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_olt_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: tst r13 +; CHECK-NEXT: jl .LBB20_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB20_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp olt float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ole_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_ole_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: cmp #1, r13 +; CHECK-NEXT: jl .LBB21_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB21_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp ole float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_one_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_one_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r6 +; CHECK-NEXT: push r7 +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov 
r15, r10 +; CHECK-NEXT: mov r14, r9 +; CHECK-NEXT: mov r13, r8 +; CHECK-NEXT: mov r12, r7 +; CHECK-NEXT: call #__unordsf2 +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r6 +; CHECK-NEXT: mov r7, r12 +; CHECK-NEXT: mov r8, r13 +; CHECK-NEXT: mov r9, r14 +; CHECK-NEXT: mov r10, r15 +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: rra r6 +; CHECK-NEXT: bic r12, r6 +; CHECK-NEXT: and #1, r6 +; CHECK-NEXT: mov.b r6, r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: pop r7 +; CHECK-NEXT: pop r6 +; CHECK-NEXT: ret + %cmp = fcmp one float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ord_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_ord_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__unordsf2 +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r12 +; CHECK-NEXT: rra r12 +; CHECK-NEXT: and #1, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp ord float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_uno_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_uno_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__unordsf2 +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: bic r13, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp uno float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ueq_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_ueq_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: push r6 +; CHECK-NEXT: push r7 +; CHECK-NEXT: push r8 +; CHECK-NEXT: push r9 +; CHECK-NEXT: push r10 +; CHECK-NEXT: mov r15, r10 +; CHECK-NEXT: mov r14, r9 +; CHECK-NEXT: mov r13, r8 +; CHECK-NEXT: mov r12, r7 +; CHECK-NEXT: call #__unordsf2 +; CHECK-NEXT: mov r12, r6 +; CHECK-NEXT: mov r7, r12 +; CHECK-NEXT: mov r8, r13 +; CHECK-NEXT: mov r9, r14 +; CHECK-NEXT: mov r10, r15 +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r12 +; 
CHECK-NEXT: rra r12 +; CHECK-NEXT: and #1, r12 +; CHECK-NEXT: bis r6, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: pop r10 +; CHECK-NEXT: pop r9 +; CHECK-NEXT: pop r8 +; CHECK-NEXT: pop r7 +; CHECK-NEXT: pop r6 +; CHECK-NEXT: ret + %cmp = fcmp ueq float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ugt_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_ugt_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: cmp #1, r13 +; CHECK-NEXT: jge .LBB26_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB26_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp ugt float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_uge_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_uge_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: tst r13 +; CHECK-NEXT: jge .LBB27_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB27_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp uge float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ult_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_ult_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: tst r13 +; CHECK-NEXT: jl .LBB28_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB28_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp ult float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_ule_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_ule_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: mov r12, r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: cmp #1, r13 +; CHECK-NEXT: jl .LBB29_2 +; CHECK-NEXT: ; %bb.1: +; CHECK-NEXT: clr r12 +; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed 
$r12 +; CHECK-NEXT: ret + %cmp = fcmp ule float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_une_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_une_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: call #__mspabi_cmpf +; CHECK-NEXT: tst r12 +; CHECK-NEXT: mov r2, r13 +; CHECK-NEXT: rra r13 +; CHECK-NEXT: mov #1, r12 +; CHECK-NEXT: bic r13, r12 +; CHECK-NEXT: ; kill: def $r12b killed $r12b killed $r12 +; CHECK-NEXT: ret + %cmp = fcmp une float %a, %b + ret i1 %cmp +} + +define i1 @fcmp_true_f32(float %a, float %b) #0 { +; CHECK-LABEL: fcmp_true_f32: +; CHECK: ; %bb.0: +; CHECK-NEXT: mov.b #1, r12 +; CHECK-NEXT: ret + %cmp = fcmp true float %a, %b + ret i1 %cmp +} + +attributes #0 = { nounwind } From 953a778fabc48025569fe0d5b3b363b981263f21 Mon Sep 17 00:00:00 2001 From: Serge Pavlov Date: Wed, 11 Jun 2025 19:08:23 +0700 Subject: [PATCH 063/851] [RISCV][FPEnv] Lowering of fpenv intrinsics (#141498) The change implements custom lowering of `get_fpenv`, `set_fpenv` and `reset_fpenv` for RISCV target. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 44 ++++++++++++++++++++ llvm/lib/Target/RISCV/RISCVISelLowering.h | 3 ++ llvm/lib/Target/RISCV/RISCVInstrInfo.td | 7 ++++ llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 1 + llvm/test/CodeGen/RISCV/fpenv-xlen.ll | 37 ++++++++++++++++ llvm/test/CodeGen/RISCV/frm-write-in-loop.ll | 31 ++++++++++++++ 6 files changed, 123 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/fpenv-xlen.ll diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 498adee35550c..a157c94849f37 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -649,6 +649,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::GET_ROUNDING, XLenVT, Custom); setOperationAction(ISD::SET_ROUNDING, MVT::Other, Custom); + setOperationAction(ISD::GET_FPENV, XLenVT, Custom); + setOperationAction(ISD::SET_FPENV, XLenVT, Custom); + setOperationAction(ISD::RESET_FPENV, MVT::Other, Custom); } setOperationAction({ISD::GlobalAddress, ISD::BlockAddress, ISD::ConstantPool, @@ -8159,6 +8162,12 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, return lowerGET_ROUNDING(Op, DAG); case ISD::SET_ROUNDING: return lowerSET_ROUNDING(Op, DAG); + case ISD::GET_FPENV: + return lowerGET_FPENV(Op, DAG); + case ISD::SET_FPENV: + return lowerSET_FPENV(Op, DAG); + case ISD::RESET_FPENV: + return lowerRESET_FPENV(Op, DAG); case ISD::EH_DWARF_CFA: return lowerEH_DWARF_CFA(Op, DAG); case ISD::VP_MERGE: @@ -13799,6 +13808,41 @@ SDValue RISCVTargetLowering::lowerSET_ROUNDING(SDValue Op, RMValue); } +SDValue RISCVTargetLowering::lowerGET_FPENV(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + SDVTList VTs = DAG.getVTList(XLenVT, MVT::Other); + return 
DAG.getNode(RISCVISD::READ_CSR, DL, VTs, Chain, SysRegNo); +} + +SDValue RISCVTargetLowering::lowerSET_FPENV(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue EnvValue = Op->getOperand(1); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + + EnvValue = DAG.getNode(ISD::ZERO_EXTEND, DL, XLenVT, EnvValue); + return DAG.getNode(RISCVISD::WRITE_CSR, DL, MVT::Other, Chain, SysRegNo, + EnvValue); +} + +SDValue RISCVTargetLowering::lowerRESET_FPENV(SDValue Op, + SelectionDAG &DAG) const { + const MVT XLenVT = Subtarget.getXLenVT(); + SDLoc DL(Op); + SDValue Chain = Op->getOperand(0); + SDValue EnvValue = DAG.getRegister(RISCV::X0, XLenVT); + SDValue SysRegNo = DAG.getTargetConstant(RISCVSysReg::fcsr, DL, XLenVT); + + return DAG.getNode(RISCVISD::WRITE_CSR, DL, MVT::Other, Chain, SysRegNo, + EnvValue); +} + SDValue RISCVTargetLowering::lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index 2ea2bf656ffd7..417d684a62382 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -544,6 +544,9 @@ class RISCVTargetLowering : public TargetLowering { unsigned ExtendOpc) const; SDValue lowerGET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; SDValue lowerSET_ROUNDING(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerRESET_FPENV(SDValue Op, SelectionDAG &DAG) const; SDValue lowerEH_DWARF_CFA(SDValue Op, SelectionDAG &DAG) const; SDValue lowerCTLZ_CTTZ_ZERO_UNDEF(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 0d1ff09f4da3a..70fad925cf070 100644 
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -2037,6 +2037,13 @@ let hasSideEffects = true in { def ReadFFLAGS : ReadSysReg; def WriteFFLAGS : WriteSysReg; } + +let hasPostISelHook = 1 in { +def ReadFCSR : ReadSysReg; +def WriteFCSR : WriteSysReg; +def WriteFCSRImm : WriteSysRegImm; +} + /// Other pseudo-instructions // Pessimistically assume the stack pointer will be clobbered diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 80213e1503b0a..e87f4523a84f9 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -846,6 +846,7 @@ foreach m = LMULList in { def FFLAGS : RISCVReg<0, "fflags">; def FRM : RISCVReg<0, "frm">; +def FCSR : RISCVReg<0, "fcsr">; // Shadow Stack register def SSP : RISCVReg<0, "ssp">; diff --git a/llvm/test/CodeGen/RISCV/fpenv-xlen.ll b/llvm/test/CodeGen/RISCV/fpenv-xlen.ll new file mode 100644 index 0000000000000..148186b21c125 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/fpenv-xlen.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+f -verify-machineinstrs | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+f -verify-machineinstrs | FileCheck %s +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+zfinx -verify-machineinstrs | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+zfinx -verify-machineinstrs | FileCheck %s +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+f -verify-machineinstrs -O0 | FileCheck %s +; RUN: sed 's/iXLen/i64/g' %s | llc -mtriple=riscv64 -mattr=+f -verify-machineinstrs -O0 | FileCheck %s + +define iXLen @func_get_fpenv() { +; CHECK-LABEL: func_get_fpenv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: frcsr a0 +; CHECK-NEXT: ret +entry: + %fpenv = call iXLen 
@llvm.get.fpenv.iXLen() + ret iXLen %fpenv +} + +define void @func_set_fpenv(iXLen %fpenv) { +; CHECK-LABEL: func_set_fpenv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fscsr a0 +; CHECK-NEXT: ret +entry: + call void @llvm.set.fpenv.iXLen(iXLen %fpenv) + ret void +} + +define void @func_reset_fpenv() { +; CHECK-LABEL: func_reset_fpenv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fscsr zero +; CHECK-NEXT: ret +entry: + call void @llvm.reset.fpenv() + ret void +} diff --git a/llvm/test/CodeGen/RISCV/frm-write-in-loop.ll b/llvm/test/CodeGen/RISCV/frm-write-in-loop.ll index 4f435067343b7..72c5951178276 100644 --- a/llvm/test/CodeGen/RISCV/frm-write-in-loop.ll +++ b/llvm/test/CodeGen/RISCV/frm-write-in-loop.ll @@ -90,3 +90,34 @@ loop: exit: ret double %f2 } + +define double @foo2(double %0, double %1, i64 %n, i64 %fcsr) strictfp { +; CHECK-LABEL: foo2: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: fmv.d.x fa5, zero +; CHECK-NEXT: .LBB2_1: # %loop +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: csrwi fcsr, 0 +; CHECK-NEXT: fadd.d fa5, fa5, fa0 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: fscsr a1 +; CHECK-NEXT: fadd.d fa5, fa5, fa1 +; CHECK-NEXT: beqz a0, .LBB2_1 +; CHECK-NEXT: # %bb.2: # %exit +; CHECK-NEXT: fmv.d fa0, fa5 +; CHECK-NEXT: ret +entry: + br label %loop +loop: + %cnt = phi i64 [0, %entry], [%cnt_inc, %loop] + %acc = phi double [0.0, %entry], [%f2, %loop] + call void @llvm.set.fpenv(i64 0) strictfp + %f1 = call double @llvm.experimental.constrained.fadd.f64(double %acc, double %0, metadata !"round.dynamic", metadata !"fpexcept.ignore") strictfp + call void @llvm.set.fpenv(i64 %fcsr) strictfp + %f2 = call double @llvm.experimental.constrained.fadd.f64(double %f1, double %1, metadata !"round.dynamic", metadata !"fpexcept.ignore") strictfp + %cnt_inc = add i64 %cnt, 1 + %cond = icmp eq i64 %cnt_inc, %n + br i1 %cond, label %loop, label %exit +exit: + ret double %f2 +} From 4a46ead8fb5b57e69bcd1c72ebd7dd8eaf09fa9c Mon Sep 17 00:00:00 2001 From: 
Adrian Vogelsgesang Date: Wed, 11 Jun 2025 14:09:54 +0200 Subject: [PATCH 064/851] [lldb] Show coro_frame in `std::coroutine_handle` pretty printer (#141516) This commit adjusts the pretty printer for `std::coroutine_handle` based on recent personal experiences with debugging C++20 coroutines: 1. It adds the `coro_frame` member. This member exposes the complete coroutine frame contents, including the suspension point id and all internal variables which the compiler decided to persist into the coroutine frame. While this data is highly compiler-specific, inspecting it can help identify the internal state of suspended coroutines. 2. It includes the `promise` and `coro_frame` members, even if devirtualization failed and we could not infer the promise type / the coro_frame type. Having them available as `void*` pointers can still be useful to identify, e.g., which two coroutine handles have the same frame / promise pointers. --- .../lldb/DataFormatters/TypeSynthetic.h | 2 +- lldb/source/DataFormatters/TypeSynthetic.cpp | 6 +- .../Plugins/Language/CPlusPlus/Coroutines.cpp | 145 ++++++++---------- .../Plugins/Language/CPlusPlus/Coroutines.h | 4 +- .../coroutine_handle/TestCoroutineHandle.py | 46 ++++-- 5 files changed, 101 insertions(+), 102 deletions(-) diff --git a/lldb/include/lldb/DataFormatters/TypeSynthetic.h b/lldb/include/lldb/DataFormatters/TypeSynthetic.h index 37f02fb8f7ce5..11a4ca2cd8330 100644 --- a/lldb/include/lldb/DataFormatters/TypeSynthetic.h +++ b/lldb/include/lldb/DataFormatters/TypeSynthetic.h @@ -92,7 +92,7 @@ class SyntheticChildrenFrontEnd { lldb::ValueObjectSP CreateValueObjectFromAddress(llvm::StringRef name, uint64_t address, const ExecutionContext &exe_ctx, - CompilerType type); + CompilerType type, bool do_deref = true); lldb::ValueObjectSP CreateValueObjectFromData(llvm::StringRef name, const DataExtractor &data, diff --git a/lldb/source/DataFormatters/TypeSynthetic.cpp b/lldb/source/DataFormatters/TypeSynthetic.cpp index 
57009b07dc553..33af0ad63077f 100644 --- a/lldb/source/DataFormatters/TypeSynthetic.cpp +++ b/lldb/source/DataFormatters/TypeSynthetic.cpp @@ -138,9 +138,9 @@ lldb::ValueObjectSP SyntheticChildrenFrontEnd::CreateValueObjectFromExpression( lldb::ValueObjectSP SyntheticChildrenFrontEnd::CreateValueObjectFromAddress( llvm::StringRef name, uint64_t address, const ExecutionContext &exe_ctx, - CompilerType type) { - ValueObjectSP valobj_sp( - ValueObject::CreateValueObjectFromAddress(name, address, exe_ctx, type)); + CompilerType type, bool do_deref) { + ValueObjectSP valobj_sp(ValueObject::CreateValueObjectFromAddress( + name, address, exe_ctx, type, do_deref)); if (valobj_sp) valobj_sp->SetSyntheticChildrenGenerated(true); return valobj_sp; diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp index 9d84af4a85384..e8c2db1886333 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.cpp @@ -11,8 +11,6 @@ #include "Plugins/TypeSystem/Clang/TypeSystemClang.h" #include "lldb/Symbol/Function.h" #include "lldb/Symbol/VariableList.h" -#include "lldb/Utility/LLDBLog.h" -#include "lldb/Utility/Log.h" using namespace lldb; using namespace lldb_private; @@ -61,19 +59,23 @@ static Function *ExtractDestroyFunction(lldb::TargetSP target_sp, return destroy_func_address.CalculateSymbolContextFunction(); } -static CompilerType InferPromiseType(Function &destroy_func) { - Block &block = destroy_func.GetBlock(true); +// clang generates artificial `__promise` and `__coro_frame` variables inside +// the destroy function. Look for those variables and extract their type.
+static CompilerType InferArtificialCoroType(Function *destroy_func, + ConstString var_name) { + if (!destroy_func) + return {}; + + Block &block = destroy_func->GetBlock(true); auto variable_list = block.GetBlockVariableList(true); - // clang generates an artificial `__promise` variable inside the - // `destroy` function. Look for it. - auto promise_var = variable_list->FindVariable(ConstString("__promise")); - if (!promise_var) + auto var = variable_list->FindVariable(var_name); + if (!var) return {}; - if (!promise_var->IsArtificial()) + if (!var->IsArtificial()) return {}; - Type *promise_type = promise_var->GetType(); + Type *promise_type = var->GetType(); if (!promise_type) return {}; return promise_type->GetForwardCompilerType(); @@ -107,30 +109,17 @@ lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd:: llvm::Expected lldb_private::formatters:: StdlibCoroutineHandleSyntheticFrontEnd::CalculateNumChildren() { - if (!m_resume_ptr_sp || !m_destroy_ptr_sp) - return 0; - - return m_promise_ptr_sp ? 3 : 2; + return m_children.size(); } lldb::ValueObjectSP lldb_private::formatters:: StdlibCoroutineHandleSyntheticFrontEnd::GetChildAtIndex(uint32_t idx) { - switch (idx) { - case 0: - return m_resume_ptr_sp; - case 1: - return m_destroy_ptr_sp; - case 2: - return m_promise_ptr_sp; - } - return lldb::ValueObjectSP(); + return idx < m_children.size() ? 
m_children[idx] : lldb::ValueObjectSP(); } lldb::ChildCacheState lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() { - m_resume_ptr_sp.reset(); - m_destroy_ptr_sp.reset(); - m_promise_ptr_sp.reset(); + m_children.clear(); ValueObjectSP valobj_sp = m_backend.GetNonSyntheticValue(); if (!valobj_sp) @@ -140,60 +129,66 @@ lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() { if (frame_ptr_addr == 0 || frame_ptr_addr == LLDB_INVALID_ADDRESS) return lldb::ChildCacheState::eRefetch; - auto ast_ctx = valobj_sp->GetCompilerType().GetTypeSystem(); - if (!ast_ctx) - return lldb::ChildCacheState::eRefetch; - - // Create the `resume` and `destroy` children. lldb::TargetSP target_sp = m_backend.GetTargetSP(); auto &exe_ctx = m_backend.GetExecutionContextRef(); lldb::ProcessSP process_sp = target_sp->GetProcessSP(); auto ptr_size = process_sp->GetAddressByteSize(); - CompilerType void_type = ast_ctx->GetBasicType(lldb::eBasicTypeVoid); - std::array args{void_type}; - CompilerType coro_func_type = ast_ctx->CreateFunctionType( - /*result_type=*/void_type, args, - /*is_variadic=*/false, /*qualifiers=*/0); - CompilerType coro_func_ptr_type = coro_func_type.GetPointerType(); - m_resume_ptr_sp = CreateValueObjectFromAddress( - "resume", frame_ptr_addr + 0 * ptr_size, exe_ctx, coro_func_ptr_type); - lldbassert(m_resume_ptr_sp); - m_destroy_ptr_sp = CreateValueObjectFromAddress( - "destroy", frame_ptr_addr + 1 * ptr_size, exe_ctx, coro_func_ptr_type); - lldbassert(m_destroy_ptr_sp); - - // Get the `promise_type` from the template argument - CompilerType promise_type( - valobj_sp->GetCompilerType().GetTypeTemplateArgument(0)); - if (!promise_type) + auto ast_ctx = valobj_sp->GetCompilerType().GetTypeSystem(); + if (!ast_ctx) return lldb::ChildCacheState::eRefetch; - // Try to infer the promise_type if it was type-erased + // Determine the coroutine frame type and the promise type. 
Fall back + // to `void`, since even the pointer itself might be useful, even if the + // type inference failed. + Function *destroy_func = ExtractDestroyFunction(target_sp, frame_ptr_addr); + CompilerType void_type = ast_ctx->GetBasicType(lldb::eBasicTypeVoid); + CompilerType promise_type; + if (CompilerType template_arg = + valobj_sp->GetCompilerType().GetTypeTemplateArgument(0)) + promise_type = std::move(template_arg); if (promise_type.IsVoidType()) { - if (Function *destroy_func = - ExtractDestroyFunction(target_sp, frame_ptr_addr)) { - if (CompilerType inferred_type = InferPromiseType(*destroy_func)) { + // Try to infer the promise_type if it was type-erased + if (destroy_func) { + if (CompilerType inferred_type = + InferArtificialCoroType(destroy_func, ConstString("__promise"))) { promise_type = inferred_type; } } } + CompilerType coro_frame_type = + InferArtificialCoroType(destroy_func, ConstString("__coro_frame")); + if (!coro_frame_type) + coro_frame_type = void_type; - // If we don't know the promise type, we don't display the `promise` member. - // `CreateValueObjectFromAddress` below would fail for `void` types. - if (promise_type.IsVoidType()) { - return lldb::ChildCacheState::eRefetch; - } - - // Add the `promise` member. We intentionally add `promise` as a pointer type - // instead of a value type, and don't automatically dereference this pointer. - // We do so to avoid potential very deep recursion in case there is a cycle - // formed between `std::coroutine_handle`s and their promises. - lldb::ValueObjectSP promise = CreateValueObjectFromAddress( - "promise", frame_ptr_addr + 2 * ptr_size, exe_ctx, promise_type); - Status error; - lldb::ValueObjectSP promisePtr = promise->AddressOf(error); - if (error.Success()) - m_promise_ptr_sp = promisePtr->Clone(ConstString("promise")); + // Create the `resume` and `destroy` children. 
+ std::array args{coro_frame_type}; + CompilerType coro_func_type = ast_ctx->CreateFunctionType( + /*result_type=*/void_type, args, + /*is_variadic=*/false, /*qualifiers=*/0); + CompilerType coro_func_ptr_type = coro_func_type.GetPointerType(); + ValueObjectSP resume_ptr_sp = CreateValueObjectFromAddress( + "resume", frame_ptr_addr + 0 * ptr_size, exe_ctx, coro_func_ptr_type); + assert(resume_ptr_sp); + m_children.push_back(std::move(resume_ptr_sp)); + ValueObjectSP destroy_ptr_sp = CreateValueObjectFromAddress( + "destroy", frame_ptr_addr + 1 * ptr_size, exe_ctx, coro_func_ptr_type); + assert(destroy_ptr_sp); + m_children.push_back(std::move(destroy_ptr_sp)); + + // Add promise and coro_frame + // Add the `promise` and `coro_frame` members. We intentionally add them as + // pointer types instead of value types, and don't automatically dereference + // those pointers. We do so to avoid potential very deep recursion in case + // there is a cycle formed between `std::coroutine_handle`s and their + // promises.
+ ValueObjectSP promise_ptr_sp = CreateValueObjectFromAddress( + "promise", frame_ptr_addr + 2 * ptr_size, exe_ctx, + promise_type.GetPointerType(), /*do_deref=*/false); + m_children.push_back(std::move(promise_ptr_sp)); + ValueObjectSP coroframe_ptr_sp = CreateValueObjectFromAddress( + "coro_frame", frame_ptr_addr, exe_ctx, coro_frame_type.GetPointerType(), + /*do_deref=*/false); + m_children.push_back(std::move(coroframe_ptr_sp)); return lldb::ChildCacheState::eRefetch; } @@ -201,16 +196,10 @@ lldb_private::formatters::StdlibCoroutineHandleSyntheticFrontEnd::Update() { llvm::Expected StdlibCoroutineHandleSyntheticFrontEnd::GetIndexOfChildWithName( ConstString name) { - if (!m_resume_ptr_sp || !m_destroy_ptr_sp) - return llvm::createStringError("Type has no child named '%s'", - name.AsCString()); - - if (name == ConstString("resume")) - return 0; - if (name == ConstString("destroy")) - return 1; - if (name == ConstString("promise_ptr") && m_promise_ptr_sp) - return 2; + for (const auto &[idx, child_sp] : llvm::enumerate(m_children)) { + if (child_sp->GetName() == name) + return idx; + } return llvm::createStringError("Type has no child named '%s'", name.AsCString()); diff --git a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h index fd9445d46e6a0..520d8e0b3c79d 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h +++ b/lldb/source/Plugins/Language/CPlusPlus/Coroutines.h @@ -43,9 +43,7 @@ class StdlibCoroutineHandleSyntheticFrontEnd llvm::Expected GetIndexOfChildWithName(ConstString name) override; private: - lldb::ValueObjectSP m_resume_ptr_sp; - lldb::ValueObjectSP m_destroy_ptr_sp; - lldb::ValueObjectSP m_promise_ptr_sp; + std::vector m_children; }; SyntheticChildrenFrontEnd * diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py 
b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py index f471ea728f953..54bb661057cd6 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/generic/coroutine_handle/TestCoroutineHandle.py @@ -46,11 +46,17 @@ def do_test(self, stdlib_type): ValueCheck(name="current_value", value="-1"), ], ), + # We do not check any members inside the `coro_frame`, + # as its contents are highly compiler-specific. + ValueCheck(name="coro_frame"), ], ) + + # For a type-erased `coroutine_handle<>`, we can still devirtualize + # the promise call and display the correctly typed promise. This + # currently only works in clang, because gcc is not adding the + # artificial `__promise` variable to the destroy function. if is_clang: - # For a type-erased `coroutine_handle<>`, we can still devirtualize - # the promise call and display the correctly typed promise. self.expect_expr( "type_erased_hdl", result_summary=re.compile("^coro frame = 0x[0-9a-f]*$"), @@ -63,23 +69,26 @@ def do_test(self, stdlib_type): ValueCheck(name="current_value", value="-1"), ], ), + ValueCheck(name="coro_frame"), ], ) - # For an incorrectly typed `coroutine_handle`, we use the user-supplied - # incorrect type instead of inferring the correct type. Strictly speaking, - # incorrectly typed coroutine handles are undefined behavior. However, - # it provides probably a better debugging experience if we display the - # promise as seen by the program instead of fixing this bug based on - # the available debug info.
- self.expect_expr( - "incorrectly_typed_hdl", - result_summary=re.compile("^coro frame = 0x[0-9a-f]*$"), - result_children=[ - ValueCheck(name="resume", summary=test_generator_func_ptr_re), - ValueCheck(name="destroy", summary=test_generator_func_ptr_re), - ValueCheck(name="promise", dereference=ValueCheck(value="-1")), - ], - ) + + # For an incorrectly typed `coroutine_handle`, we use the user-supplied + # incorrect type instead of inferring the correct type. Strictly speaking, + # incorrectly typed coroutine handles are undefined behavior. However, + # it provides probably a better debugging experience if we display the + # promise as seen by the program instead of fixing this bug based on + # the available debug info. + self.expect_expr( + "incorrectly_typed_hdl", + result_summary=re.compile("^coro frame = 0x[0-9a-f]*$"), + result_children=[ + ValueCheck(name="resume", summary=test_generator_func_ptr_re), + ValueCheck(name="destroy", summary=test_generator_func_ptr_re), + ValueCheck(name="promise", dereference=ValueCheck(value="-1")), + ValueCheck(name="coro_frame"), + ], + ) process = self.process() @@ -110,6 +119,7 @@ def do_test(self, stdlib_type): ValueCheck(name="current_value", value="42"), ], ), + ValueCheck(name="coro_frame"), ], ) @@ -133,6 +143,7 @@ def do_test(self, stdlib_type): ValueCheck(name="current_value", value="42"), ], ), + ValueCheck(name="coro_frame"), ], ) if is_clang: @@ -150,6 +161,7 @@ def do_test(self, stdlib_type): ValueCheck(name="current_value", value="42"), ], ), + ValueCheck(name="coro_frame"), ], ) From 3ef7d035e21d8f75eb85b521d7ff0203e60cb6f2 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 11 Jun 2025 21:14:59 +0900 Subject: [PATCH 065/851] MSP430: Stop using setCmpLibcallCC (#142708) This appears to only be useful for the eq/ne cases, and only for ARM libcalls. This is setting it to the default values, and there's no change in the new fcmp test output. 
--- llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 183 +++++++++--------- 1 file changed, 89 insertions(+), 94 deletions(-) diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index 5589cea6e675d..8c55f77d062b7 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -151,104 +151,99 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, // EABI Libcalls - EABI Section 6.2 const struct { const RTLIB::Libcall Op; - const char * const Name; - const ISD::CondCode Cond; + const char *const Name; } LibraryCalls[] = { - // Floating point conversions - EABI Table 6 - { RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf", ISD::SETCC_INVALID }, - { RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd", ISD::SETCC_INVALID }, - // The following is NOT implemented in libgcc - //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi", ISD::SETCC_INVALID }, - { RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli", ISD::SETCC_INVALID }, - { RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli", ISD::SETCC_INVALID }, - // The following is NOT implemented in libgcc - //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu", ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul", ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull", ISD::SETCC_INVALID }, - // The following is NOT implemented in libgcc - //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi", ISD::SETCC_INVALID }, - { RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli", ISD::SETCC_INVALID }, - { RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli", ISD::SETCC_INVALID }, - // The following is NOT implemented in libgcc - //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu", ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful", ISD::SETCC_INVALID }, - { RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull", ISD::SETCC_INVALID }, - // TODO The following IS implemented in libgcc - //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid", ISD::SETCC_INVALID }, - { 
RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid", ISD::SETCC_INVALID }, - // TODO The following IS implemented in libgcc but is not in the EABI - { RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid", ISD::SETCC_INVALID }, - // TODO The following IS implemented in libgcc - //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud", ISD::SETCC_INVALID }, - { RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld", ISD::SETCC_INVALID }, - // The following IS implemented in libgcc but is not in the EABI - { RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld", ISD::SETCC_INVALID }, - // TODO The following IS implemented in libgcc - //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif", ISD::SETCC_INVALID }, - { RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif", ISD::SETCC_INVALID }, - // TODO The following IS implemented in libgcc but is not in the EABI - { RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif", ISD::SETCC_INVALID }, - // TODO The following IS implemented in libgcc - //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf", ISD::SETCC_INVALID }, - { RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf", ISD::SETCC_INVALID }, - // The following IS implemented in libgcc but is not in the EABI - { RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf", ISD::SETCC_INVALID }, - - // Floating point comparisons - EABI Table 7 - { RTLIB::OEQ_F64, "__mspabi_cmpd", ISD::SETEQ }, - { RTLIB::UNE_F64, "__mspabi_cmpd", ISD::SETNE }, - { RTLIB::OGE_F64, "__mspabi_cmpd", ISD::SETGE }, - { RTLIB::OLT_F64, "__mspabi_cmpd", ISD::SETLT }, - { RTLIB::OLE_F64, "__mspabi_cmpd", ISD::SETLE }, - { RTLIB::OGT_F64, "__mspabi_cmpd", ISD::SETGT }, - { RTLIB::OEQ_F32, "__mspabi_cmpf", ISD::SETEQ }, - { RTLIB::UNE_F32, "__mspabi_cmpf", ISD::SETNE }, - { RTLIB::OGE_F32, "__mspabi_cmpf", ISD::SETGE }, - { RTLIB::OLT_F32, "__mspabi_cmpf", ISD::SETLT }, - { RTLIB::OLE_F32, "__mspabi_cmpf", ISD::SETLE }, - { RTLIB::OGT_F32, "__mspabi_cmpf", ISD::SETGT }, - - // Floating point arithmetic - EABI Table 8 - { RTLIB::ADD_F64, "__mspabi_addd", ISD::SETCC_INVALID }, - { RTLIB::ADD_F32, 
"__mspabi_addf", ISD::SETCC_INVALID }, - { RTLIB::DIV_F64, "__mspabi_divd", ISD::SETCC_INVALID }, - { RTLIB::DIV_F32, "__mspabi_divf", ISD::SETCC_INVALID }, - { RTLIB::MUL_F64, "__mspabi_mpyd", ISD::SETCC_INVALID }, - { RTLIB::MUL_F32, "__mspabi_mpyf", ISD::SETCC_INVALID }, - { RTLIB::SUB_F64, "__mspabi_subd", ISD::SETCC_INVALID }, - { RTLIB::SUB_F32, "__mspabi_subf", ISD::SETCC_INVALID }, - // The following are NOT implemented in libgcc - // { RTLIB::NEG_F64, "__mspabi_negd", ISD::SETCC_INVALID }, - // { RTLIB::NEG_F32, "__mspabi_negf", ISD::SETCC_INVALID }, - - // Universal Integer Operations - EABI Table 9 - { RTLIB::SDIV_I16, "__mspabi_divi", ISD::SETCC_INVALID }, - { RTLIB::SDIV_I32, "__mspabi_divli", ISD::SETCC_INVALID }, - { RTLIB::SDIV_I64, "__mspabi_divlli", ISD::SETCC_INVALID }, - { RTLIB::UDIV_I16, "__mspabi_divu", ISD::SETCC_INVALID }, - { RTLIB::UDIV_I32, "__mspabi_divul", ISD::SETCC_INVALID }, - { RTLIB::UDIV_I64, "__mspabi_divull", ISD::SETCC_INVALID }, - { RTLIB::SREM_I16, "__mspabi_remi", ISD::SETCC_INVALID }, - { RTLIB::SREM_I32, "__mspabi_remli", ISD::SETCC_INVALID }, - { RTLIB::SREM_I64, "__mspabi_remlli", ISD::SETCC_INVALID }, - { RTLIB::UREM_I16, "__mspabi_remu", ISD::SETCC_INVALID }, - { RTLIB::UREM_I32, "__mspabi_remul", ISD::SETCC_INVALID }, - { RTLIB::UREM_I64, "__mspabi_remull", ISD::SETCC_INVALID }, - - // Bitwise Operations - EABI Table 10 - // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc - { RTLIB::SRL_I32, "__mspabi_srll", ISD::SETCC_INVALID }, - { RTLIB::SRA_I32, "__mspabi_sral", ISD::SETCC_INVALID }, - { RTLIB::SHL_I32, "__mspabi_slll", ISD::SETCC_INVALID }, - // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc - + // Floating point conversions - EABI Table 6 + {RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf"}, + {RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd"}, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi" }, + {RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli"}, + 
{RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli"}, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu" }, + {RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul"}, + {RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull"}, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi" }, + {RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli"}, + {RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli"}, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu" }, + {RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful"}, + {RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull"}, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid" }, + {RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid"}, + // TODO The following IS implemented in libgcc but is not in the EABI + {RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid"}, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud" }, + {RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld"}, + // The following IS implemented in libgcc but is not in the EABI + {RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld"}, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif" }, + {RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif"}, + // TODO The following IS implemented in libgcc but is not in the EABI + {RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif"}, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf" }, + {RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf"}, + // The following IS implemented in libgcc but is not in the EABI + {RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf"}, + + // Floating point comparisons - EABI Table 7 + {RTLIB::OEQ_F64, "__mspabi_cmpd"}, + {RTLIB::UNE_F64, "__mspabi_cmpd"}, + {RTLIB::OGE_F64, "__mspabi_cmpd"}, + {RTLIB::OLT_F64, "__mspabi_cmpd"}, + {RTLIB::OLE_F64, "__mspabi_cmpd"}, + {RTLIB::OGT_F64, "__mspabi_cmpd"}, + {RTLIB::OEQ_F32, 
"__mspabi_cmpf"}, + {RTLIB::UNE_F32, "__mspabi_cmpf"}, + {RTLIB::OGE_F32, "__mspabi_cmpf"}, + {RTLIB::OLT_F32, "__mspabi_cmpf"}, + {RTLIB::OLE_F32, "__mspabi_cmpf"}, + {RTLIB::OGT_F32, "__mspabi_cmpf"}, + + // Floating point arithmetic - EABI Table 8 + {RTLIB::ADD_F64, "__mspabi_addd"}, + {RTLIB::ADD_F32, "__mspabi_addf"}, + {RTLIB::DIV_F64, "__mspabi_divd"}, + {RTLIB::DIV_F32, "__mspabi_divf"}, + {RTLIB::MUL_F64, "__mspabi_mpyd"}, + {RTLIB::MUL_F32, "__mspabi_mpyf"}, + {RTLIB::SUB_F64, "__mspabi_subd"}, + {RTLIB::SUB_F32, "__mspabi_subf"}, + // The following are NOT implemented in libgcc + // { RTLIB::NEG_F64, "__mspabi_negd" }, + // { RTLIB::NEG_F32, "__mspabi_negf" }, + + // Universal Integer Operations - EABI Table 9 + {RTLIB::SDIV_I16, "__mspabi_divi"}, + {RTLIB::SDIV_I32, "__mspabi_divli"}, + {RTLIB::SDIV_I64, "__mspabi_divlli"}, + {RTLIB::UDIV_I16, "__mspabi_divu"}, + {RTLIB::UDIV_I32, "__mspabi_divul"}, + {RTLIB::UDIV_I64, "__mspabi_divull"}, + {RTLIB::SREM_I16, "__mspabi_remi"}, + {RTLIB::SREM_I32, "__mspabi_remli"}, + {RTLIB::SREM_I64, "__mspabi_remlli"}, + {RTLIB::UREM_I16, "__mspabi_remu"}, + {RTLIB::UREM_I32, "__mspabi_remul"}, + {RTLIB::UREM_I64, "__mspabi_remull"}, + + // Bitwise Operations - EABI Table 10 + // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc + {RTLIB::SRL_I32, "__mspabi_srll"}, + {RTLIB::SRA_I32, "__mspabi_sral"}, + {RTLIB::SHL_I32, "__mspabi_slll"}, + // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc }; - for (const auto &LC : LibraryCalls) { + for (const auto &LC : LibraryCalls) setLibcallName(LC.Op, LC.Name); - if (LC.Cond != ISD::SETCC_INVALID) - setCmpLibcallCC(LC.Op, LC.Cond); - } if (STI.hasHWMult16()) { const struct { From ac7fa4099e83d6490d2f9ea185b236db2f26e652 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 11 Jun 2025 21:17:58 +0900 Subject: [PATCH 066/851] MSP430: Partially move runtime libcall config out of TargetLowering (#142709) RuntimeLibcalls needs to be correct outside of 
codegen contexts. --- llvm/lib/IR/RuntimeLibcalls.cpp | 120 ++++++++++++++++++ llvm/lib/Target/MSP430/MSP430ISelLowering.cpp | 114 ----------------- 2 files changed, 120 insertions(+), 114 deletions(-) diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 541379e7ade48..31013310a746d 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -81,6 +81,123 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) { } } +static void setMSP430Libcalls(RuntimeLibcallsInfo &Info, const Triple &TT) { + // EABI Libcalls - EABI Section 6.2 + const struct { + const RTLIB::Libcall Op; + const char *const Name; + } LibraryCalls[] = { + // Floating point conversions - EABI Table 6 + {RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf"}, + {RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd"}, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi" }, + {RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli"}, + {RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli"}, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu" }, + {RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul"}, + {RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull"}, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi" }, + {RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli"}, + {RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli"}, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu" }, + {RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful"}, + {RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull"}, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid" }, + {RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid"}, + // TODO The following IS implemented in libgcc but is not in the EABI + {RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid"}, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud" }, 
+ {RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld"}, + // The following IS implemented in libgcc but is not in the EABI + {RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld"}, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif" }, + {RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif"}, + // TODO The following IS implemented in libgcc but is not in the EABI + {RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif"}, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf" }, + {RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf"}, + // The following IS implemented in libgcc but is not in the EABI + {RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf"}, + + // Floating point comparisons - EABI Table 7 + {RTLIB::OEQ_F64, "__mspabi_cmpd"}, + {RTLIB::UNE_F64, "__mspabi_cmpd"}, + {RTLIB::OGE_F64, "__mspabi_cmpd"}, + {RTLIB::OLT_F64, "__mspabi_cmpd"}, + {RTLIB::OLE_F64, "__mspabi_cmpd"}, + {RTLIB::OGT_F64, "__mspabi_cmpd"}, + {RTLIB::OEQ_F32, "__mspabi_cmpf"}, + {RTLIB::UNE_F32, "__mspabi_cmpf"}, + {RTLIB::OGE_F32, "__mspabi_cmpf"}, + {RTLIB::OLT_F32, "__mspabi_cmpf"}, + {RTLIB::OLE_F32, "__mspabi_cmpf"}, + {RTLIB::OGT_F32, "__mspabi_cmpf"}, + + // Floating point arithmetic - EABI Table 8 + {RTLIB::ADD_F64, "__mspabi_addd"}, + {RTLIB::ADD_F32, "__mspabi_addf"}, + {RTLIB::DIV_F64, "__mspabi_divd"}, + {RTLIB::DIV_F32, "__mspabi_divf"}, + {RTLIB::MUL_F64, "__mspabi_mpyd"}, + {RTLIB::MUL_F32, "__mspabi_mpyf"}, + {RTLIB::SUB_F64, "__mspabi_subd"}, + {RTLIB::SUB_F32, "__mspabi_subf"}, + // The following are NOT implemented in libgcc + // { RTLIB::NEG_F64, "__mspabi_negd" }, + // { RTLIB::NEG_F32, "__mspabi_negf" }, + + // Universal Integer Operations - EABI Table 9 + {RTLIB::SDIV_I16, "__mspabi_divi"}, + {RTLIB::SDIV_I32, "__mspabi_divli"}, + {RTLIB::SDIV_I64, "__mspabi_divlli"}, + {RTLIB::UDIV_I16, "__mspabi_divu"}, + {RTLIB::UDIV_I32, "__mspabi_divul"}, + {RTLIB::UDIV_I64, "__mspabi_divull"}, + {RTLIB::SREM_I16, "__mspabi_remi"}, + 
{RTLIB::SREM_I32, "__mspabi_remli"}, + {RTLIB::SREM_I64, "__mspabi_remlli"}, + {RTLIB::UREM_I16, "__mspabi_remu"}, + {RTLIB::UREM_I32, "__mspabi_remul"}, + {RTLIB::UREM_I64, "__mspabi_remull"}, + + // Bitwise Operations - EABI Table 10 + // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc + {RTLIB::SRL_I32, "__mspabi_srll"}, + {RTLIB::SRA_I32, "__mspabi_sral"}, + {RTLIB::SHL_I32, "__mspabi_slll"}, + // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc + }; + + for (const auto &LC : LibraryCalls) + Info.setLibcallName(LC.Op, LC.Name); + + // Several of the runtime library functions use a special calling conv + Info.setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN); + Info.setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN); + + // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll +} + /// Set default libcall names. If a target wants to opt-out of a libcall it /// should be placed here. 
void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { @@ -448,4 +565,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { else setLibcallName(RTLIB::SQRT_F32, "__hexagon_sqrtf"); } + + if (TT.getArch() == Triple::ArchType::msp430) + setMSP430Libcalls(*this, TT); } diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp index 8c55f77d062b7..c2946de838d77 100644 --- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -148,103 +148,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::JumpTable, MVT::i16, Custom); - // EABI Libcalls - EABI Section 6.2 - const struct { - const RTLIB::Libcall Op; - const char *const Name; - } LibraryCalls[] = { - // Floating point conversions - EABI Table 6 - {RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf"}, - {RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd"}, - // The following is NOT implemented in libgcc - //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi" }, - {RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli"}, - {RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli"}, - // The following is NOT implemented in libgcc - //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu" }, - {RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul"}, - {RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull"}, - // The following is NOT implemented in libgcc - //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi" }, - {RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli"}, - {RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli"}, - // The following is NOT implemented in libgcc - //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu" }, - {RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful"}, - {RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull"}, - // TODO The following IS implemented in libgcc - //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid" }, - {RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid"}, - // TODO The following IS implemented in libgcc but is not in the 
EABI - {RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid"}, - // TODO The following IS implemented in libgcc - //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud" }, - {RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld"}, - // The following IS implemented in libgcc but is not in the EABI - {RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld"}, - // TODO The following IS implemented in libgcc - //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif" }, - {RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif"}, - // TODO The following IS implemented in libgcc but is not in the EABI - {RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif"}, - // TODO The following IS implemented in libgcc - //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf" }, - {RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf"}, - // The following IS implemented in libgcc but is not in the EABI - {RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf"}, - - // Floating point comparisons - EABI Table 7 - {RTLIB::OEQ_F64, "__mspabi_cmpd"}, - {RTLIB::UNE_F64, "__mspabi_cmpd"}, - {RTLIB::OGE_F64, "__mspabi_cmpd"}, - {RTLIB::OLT_F64, "__mspabi_cmpd"}, - {RTLIB::OLE_F64, "__mspabi_cmpd"}, - {RTLIB::OGT_F64, "__mspabi_cmpd"}, - {RTLIB::OEQ_F32, "__mspabi_cmpf"}, - {RTLIB::UNE_F32, "__mspabi_cmpf"}, - {RTLIB::OGE_F32, "__mspabi_cmpf"}, - {RTLIB::OLT_F32, "__mspabi_cmpf"}, - {RTLIB::OLE_F32, "__mspabi_cmpf"}, - {RTLIB::OGT_F32, "__mspabi_cmpf"}, - - // Floating point arithmetic - EABI Table 8 - {RTLIB::ADD_F64, "__mspabi_addd"}, - {RTLIB::ADD_F32, "__mspabi_addf"}, - {RTLIB::DIV_F64, "__mspabi_divd"}, - {RTLIB::DIV_F32, "__mspabi_divf"}, - {RTLIB::MUL_F64, "__mspabi_mpyd"}, - {RTLIB::MUL_F32, "__mspabi_mpyf"}, - {RTLIB::SUB_F64, "__mspabi_subd"}, - {RTLIB::SUB_F32, "__mspabi_subf"}, - // The following are NOT implemented in libgcc - // { RTLIB::NEG_F64, "__mspabi_negd" }, - // { RTLIB::NEG_F32, "__mspabi_negf" }, - - // Universal Integer Operations - EABI Table 9 - {RTLIB::SDIV_I16, "__mspabi_divi"}, - {RTLIB::SDIV_I32, "__mspabi_divli"}, - {RTLIB::SDIV_I64, "__mspabi_divlli"}, - 
{RTLIB::UDIV_I16, "__mspabi_divu"}, - {RTLIB::UDIV_I32, "__mspabi_divul"}, - {RTLIB::UDIV_I64, "__mspabi_divull"}, - {RTLIB::SREM_I16, "__mspabi_remi"}, - {RTLIB::SREM_I32, "__mspabi_remli"}, - {RTLIB::SREM_I64, "__mspabi_remlli"}, - {RTLIB::UREM_I16, "__mspabi_remu"}, - {RTLIB::UREM_I32, "__mspabi_remul"}, - {RTLIB::UREM_I64, "__mspabi_remull"}, - - // Bitwise Operations - EABI Table 10 - // TODO: __mspabi_[srli/srai/slli] ARE implemented in libgcc - {RTLIB::SRL_I32, "__mspabi_srll"}, - {RTLIB::SRA_I32, "__mspabi_sral"}, - {RTLIB::SHL_I32, "__mspabi_slll"}, - // __mspabi_[srlll/srall/sllll/rlli/rlll] are NOT implemented in libgcc - }; - - for (const auto &LC : LibraryCalls) - setLibcallName(LC.Op, LC.Name); - if (STI.hasHWMult16()) { const struct { const RTLIB::Libcall Op; @@ -308,23 +211,6 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::MSP430_BUILTIN); } - // Several of the runtime library functions use a special calling conv - setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN); - setLibcallCallingConv(RTLIB::OGT_F64, 
CallingConv::MSP430_BUILTIN); - // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll - setMinFunctionAlignment(Align(2)); setPrefFunctionAlignment(Align(2)); setMaxAtomicSizeInBitsSupported(0); From 33fee564998598a52e802292db25c0ee52f7e1a6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Wed, 11 Jun 2025 14:22:45 +0200 Subject: [PATCH 067/851] [HLSL][SPIR-V] Change SPV AS map for groupshared (#143519) The previous mapping was setting the hlsl_groupshared AS to 0, which translated to either Generic or Function. Change this to 3, which translates to Workgroup. Related to #142804 --- clang/lib/Basic/Targets/SPIR.h | 4 ++-- clang/test/CodeGenHLSL/group_shared.hlsl | 4 ++++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 0eaf82eee756b..b416a01f0f374 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -46,7 +46,7 @@ static const unsigned SPIRDefIsPrivMap[] = { 0, // ptr32_sptr 0, // ptr32_uptr 0, // ptr64 - 0, // hlsl_groupshared + 3, // hlsl_groupshared 12, // hlsl_constant 10, // hlsl_private 11, // hlsl_device @@ -82,7 +82,7 @@ static const unsigned SPIRDefIsGenMap[] = { 0, // ptr32_sptr 0, // ptr32_uptr 0, // ptr64 - 0, // hlsl_groupshared + 3, // hlsl_groupshared 0, // hlsl_constant 10, // hlsl_private 11, // hlsl_device diff --git a/clang/test/CodeGenHLSL/group_shared.hlsl b/clang/test/CodeGenHLSL/group_shared.hlsl index 4b2e2beba4f12..a562e75b34881 100644 --- a/clang/test/CodeGenHLSL/group_shared.hlsl +++ b/clang/test/CodeGenHLSL/group_shared.hlsl @@ -3,6 +3,10 @@ // RUN: dxil-pc-shadermodel6.3-library %s \ // RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan1.3-compute %s \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s + // Make sure groupshared translated into address space 3. 
// CHECK:@a = addrspace(3) global [10 x float] From 50f534e21cfb47aaf44e1613f71b56cca55ba395 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Wed, 11 Jun 2025 14:22:54 +0200 Subject: [PATCH 068/851] [HLSL][SPIR-V] Handle SV_Position builtin in PS (#141759) This commit is using the same mechanism as vk::ext_builtin_input to implement the SV_Position semantic input. The HLSL signature is not yet ready for DXIL, hence this commit only implements the SPIR-V side. This is incomplete as it doesn't allow the semantic on hull/domain and other shaders, but it's a first step to validate the overall input/output semantic logic. Fixes https://github.com/llvm/llvm-project/issues/136969 --- clang/include/clang/Basic/Attr.td | 7 ++++ clang/include/clang/Basic/AttrDocs.td | 14 +++++++ clang/include/clang/Sema/SemaHLSL.h | 2 + clang/lib/CodeGen/CGHLSLRuntime.cpp | 42 ++++++++++++++----- clang/lib/Parse/ParseHLSL.cpp | 1 + clang/lib/Sema/SemaDeclAttr.cpp | 3 ++ clang/lib/Sema/SemaHLSL.cpp | 27 ++++++++++++ .../CodeGenHLSL/semantics/SV_Position.ps.hlsl | 10 +++++ .../test/SemaHLSL/Semantics/position.ps.hlsl | 7 ++++ .../SemaHLSL/Semantics/position.ps.size.hlsl | 10 +++++ .../test/SemaHLSL/Semantics/position.vs.hlsl | 6 +++ 11 files changed, 119 insertions(+), 10 deletions(-) create mode 100644 clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl create mode 100644 clang/test/SemaHLSL/Semantics/position.ps.hlsl create mode 100644 clang/test/SemaHLSL/Semantics/position.ps.size.hlsl create mode 100644 clang/test/SemaHLSL/Semantics/position.vs.hlsl diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index b8e5806d3c5e9..9e84462eaa660 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4901,6 +4901,13 @@ def HLSLResourceBinding: InheritableAttr { }]; } +def HLSLSV_Position : HLSLAnnotationAttr { + let Spellings = [HLSLAnnotation<"sv_position">]; + let Subjects = SubjectList<[ParmVar, Field]>; + let 
LangOpts = [HLSL]; + let Documentation = [HLSLSV_PositionDocs]; +} + def HLSLPackOffset: HLSLAnnotationAttr { let Spellings = [HLSLAnnotation<"packoffset">]; let LangOpts = [HLSL]; diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index ea3c43f38d9fe..047f51ffa59ed 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -8529,6 +8529,20 @@ The full documentation is available here: https://docs.microsoft.com/en-us/windo }]; } +def HLSLSV_PositionDocs : Documentation { + let Category = DocCatFunction; + let Content = [{ +The ``SV_Position`` semantic, when applied to an input parameter in a pixel +shader, contains the location of the pixel center (x, y) in screen space. +This semantic can be applied to the parameter, or a field in a struct used +as an input parameter. +This attribute is supported as an input in pixel, hull, domain and mesh shaders. +This attribute is supported as an output in vertex, geometry and domain shaders. 
+ +The full documentation is available here: https://docs.microsoft.com/en-us/windows/win32/direct3dhlsl/dx-graphics-hlsl-semantics + }]; +} + def HLSLGroupSharedAddressSpaceDocs : Documentation { let Category = DocCatVariable; let Content = [{ diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index 66d09f49680be..ba5f06f93dc30 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -125,6 +125,7 @@ class SemaHLSL : public SemaBase { void handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL); void handleSV_GroupThreadIDAttr(Decl *D, const ParsedAttr &AL); void handleSV_GroupIDAttr(Decl *D, const ParsedAttr &AL); + void handleSV_PositionAttr(Decl *D, const ParsedAttr &AL); void handlePackOffsetAttr(Decl *D, const ParsedAttr &AL); void handleShaderAttr(Decl *D, const ParsedAttr &AL); void handleResourceBindingAttr(Decl *D, const ParsedAttr &AL); @@ -146,6 +147,7 @@ class SemaHLSL : public SemaBase { // Diagnose whether the input ID is uint/unit2/uint3 type. 
bool diagnoseInputIDType(QualType T, const ParsedAttr &AL); + bool diagnosePositionType(QualType T, const ParsedAttr &AL); bool CanPerformScalarCast(QualType SrcTy, QualType DestTy); bool ContainsBitField(QualType BaseTy); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index 6d267e6164845..720dac8383c05 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -384,6 +384,30 @@ static Value *buildVectorInput(IRBuilder<> &B, Function *F, llvm::Type *Ty) { return B.CreateCall(F, {B.getInt32(0)}); } +static void addSPIRVBuiltinDecoration(llvm::GlobalVariable *GV, + unsigned BuiltIn) { + LLVMContext &Ctx = GV->getContext(); + IRBuilder<> B(GV->getContext()); + MDNode *Operands = MDNode::get( + Ctx, + {ConstantAsMetadata::get(B.getInt32(/* Spirv::Decoration::BuiltIn */ 11)), + ConstantAsMetadata::get(B.getInt32(BuiltIn))}); + MDNode *Decoration = MDNode::get(Ctx, {Operands}); + GV->addMetadata("spirv.Decorations", *Decoration); +} + +static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M, + llvm::Type *Ty, const Twine &Name, + unsigned BuiltInID) { + auto *GV = new llvm::GlobalVariable( + M, Ty, /* isConstant= */ true, llvm::GlobalValue::ExternalLinkage, + /* Initializer= */ nullptr, Name, /* insertBefore= */ nullptr, + llvm::GlobalVariable::GeneralDynamicTLSModel, + /* AddressSpace */ 7, /* isExternallyInitialized= */ true); + addSPIRVBuiltinDecoration(GV, BuiltInID); + return B.CreateLoad(Ty, GV); +} + llvm::Value *CGHLSLRuntime::emitInputSemantic(IRBuilder<> &B, const ParmVarDecl &D, llvm::Type *Ty) { @@ -407,6 +431,12 @@ llvm::Value *CGHLSLRuntime::emitInputSemantic(IRBuilder<> &B, llvm::Function *GroupIDIntrinsic = CGM.getIntrinsic(getGroupIdIntrinsic()); return buildVectorInput(B, GroupIDIntrinsic, Ty); } + if (D.hasAttr()) { + if (getArch() == llvm::Triple::spirv) + return createSPIRVBuiltinLoad(B, CGM.getModule(), Ty, "sv_position", + /* BuiltIn::Position */ 0); + 
llvm_unreachable("SV_Position semantic not implemented for this target."); + } assert(false && "Unhandled parameter attribute"); return nullptr; } @@ -626,16 +656,8 @@ void CGHLSLRuntime::initializeBufferFromBinding(const HLSLBufferDecl *BufDecl, void CGHLSLRuntime::handleGlobalVarDefinition(const VarDecl *VD, llvm::GlobalVariable *GV) { - if (auto Attr = VD->getAttr()) { - LLVMContext &Ctx = GV->getContext(); - IRBuilder<> B(GV->getContext()); - MDNode *Operands = MDNode::get( - Ctx, {ConstantAsMetadata::get( - B.getInt32(/* Spirv::Decoration::BuiltIn */ 11)), - ConstantAsMetadata::get(B.getInt32(Attr->getBuiltIn()))}); - MDNode *Decoration = MDNode::get(Ctx, {Operands}); - GV->addMetadata("spirv.Decorations", *Decoration); - } + if (auto Attr = VD->getAttr()) + addSPIRVBuiltinDecoration(GV, Attr->getBuiltIn()); } llvm::Instruction *CGHLSLRuntime::getConvergenceToken(BasicBlock &BB) { diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp index 5569605c287b1..53d46465e3362 100644 --- a/clang/lib/Parse/ParseHLSL.cpp +++ b/clang/lib/Parse/ParseHLSL.cpp @@ -289,6 +289,7 @@ void Parser::ParseHLSLAnnotations(ParsedAttributes &Attrs, case ParsedAttr::AT_HLSLSV_GroupID: case ParsedAttr::AT_HLSLSV_GroupIndex: case ParsedAttr::AT_HLSLSV_DispatchThreadID: + case ParsedAttr::AT_HLSLSV_Position: break; default: llvm_unreachable("invalid HLSL Annotation"); diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 6360827f415b8..1aeae41042a1c 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -7588,6 +7588,9 @@ ProcessDeclAttribute(Sema &S, Scope *scope, Decl *D, const ParsedAttr &AL, case ParsedAttr::AT_HLSLWaveSize: S.HLSL().handleWaveSizeAttr(D, AL); break; + case ParsedAttr::AT_HLSLSV_Position: + S.HLSL().handleSV_PositionAttr(D, AL); + break; case ParsedAttr::AT_HLSLVkExtBuiltinInput: S.HLSL().handleVkExtBuiltinInputAttr(D, AL); break; diff --git a/clang/lib/Sema/SemaHLSL.cpp 
b/clang/lib/Sema/SemaHLSL.cpp index 9065cc5a1d4a5..ba491b6134293 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -764,6 +764,13 @@ void SemaHLSL::CheckSemanticAnnotation( return; DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Compute}); break; + case attr::HLSLSV_Position: + // TODO(#143523): allow use on other shader types & output once the overall + // semantic logic is implemented. + if (ST == llvm::Triple::Pixel) + return; + DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Pixel}); + break; default: llvm_unreachable("Unknown HLSLAnnotationAttr"); } @@ -1147,6 +1154,26 @@ void SemaHLSL::handleSV_DispatchThreadIDAttr(Decl *D, const ParsedAttr &AL) { HLSLSV_DispatchThreadIDAttr(getASTContext(), AL)); } +bool SemaHLSL::diagnosePositionType(QualType T, const ParsedAttr &AL) { + const auto *VT = T->getAs(); + + if (!T->hasFloatingRepresentation() || (VT && VT->getNumElements() > 4)) { + Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_type) + << AL << "float/float1/float2/float3/float4"; + return false; + } + + return true; +} + +void SemaHLSL::handleSV_PositionAttr(Decl *D, const ParsedAttr &AL) { + auto *VD = cast(D); + if (!diagnosePositionType(VD->getType(), AL)) + return; + + D->addAttr(::new (getASTContext()) HLSLSV_PositionAttr(getASTContext(), AL)); +} + void SemaHLSL::handleSV_GroupThreadIDAttr(Decl *D, const ParsedAttr &AL) { auto *VD = cast(D); if (!diagnoseInputIDType(VD->getType(), AL)) diff --git a/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl new file mode 100644 index 0000000000000..58b91fc9264dd --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/SV_Position.ps.hlsl @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-pixel -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s + +// CHECK: @sv_position = external thread_local addrspace(7) externally_initialized constant <4 x float>, 
!spirv.Decorations !0 + +// CHECK: define void @main() {{.*}} { +float4 main(float4 p : SV_Position) { + // CHECK: %[[#P:]] = load <4 x float>, ptr addrspace(7) @sv_position, align 16 + // CHECK: %[[#R:]] = call spir_func <4 x float> @_Z4mainDv4_f(<4 x float> %[[#P]]) + return p; +} diff --git a/clang/test/SemaHLSL/Semantics/position.ps.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.hlsl new file mode 100644 index 0000000000000..32bc5f55b2abd --- /dev/null +++ b/clang/test/SemaHLSL/Semantics/position.ps.hlsl @@ -0,0 +1,7 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-pixel -x hlsl -finclude-default-header -o - %s -ast-dump | FileCheck %s + +float4 main(float4 a : SV_Position) { +// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:8 main 'float4 (float4)' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:20 a 'float4':'vector' +// CHECK-NEXT: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> +} diff --git a/clang/test/SemaHLSL/Semantics/position.ps.size.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.size.hlsl new file mode 100644 index 0000000000000..124d401a9990c --- /dev/null +++ b/clang/test/SemaHLSL/Semantics/position.ps.size.hlsl @@ -0,0 +1,10 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -finclude-default-header -o - %s -verify -verify-ignore-unexpected +// RUN: %clang_cc1 -triple spirv-unknown-vulkan1.3-library -x hlsl -finclude-default-header -o - %s -verify -verify-ignore-unexpected + +// expected-error@+1 {{attribute 'SV_Position' only applies to a field or parameter of type 'float/float1/float2/float3/float4'}} +void main(vector a : SV_Position) { +} + +// expected-error@+1 {{attribute 'SV_Position' only applies to a field or parameter of type 'float/float1/float2/float3/float4'}} +void main(int2 a : SV_Position) { +} diff --git a/clang/test/SemaHLSL/Semantics/position.vs.hlsl b/clang/test/SemaHLSL/Semantics/position.vs.hlsl new file mode 100644 index 0000000000000..19f781fa3757c --- /dev/null +++ 
b/clang/test/SemaHLSL/Semantics/position.vs.hlsl @@ -0,0 +1,6 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-vertex -x hlsl -finclude-default-header -o - %s -verify + +// expected-error@+1 {{attribute 'SV_Position' is unsupported in 'vertex' shaders, requires pixel}} +float4 main(float4 a : SV_Position) { + return a; +} From b49c7896c0a31ca618098b52a28eb87dff625b8f Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 14:27:48 +0200 Subject: [PATCH 069/851] [libc++] Fix constraints in `__countr_zero` and `__popcount` Currently these two functions are constrained on `is_unsigned`, which is more permissive than what is required by the standard for their public counterparts. This fixes the constraints to match the public functions by using `__libcpp_is_unsigned_integer` instead. --- libcxx/include/__bit/countr.h | 4 ++-- libcxx/include/__bit/popcount.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/include/__bit/countr.h b/libcxx/include/__bit/countr.h index 7b311b83853c5..1589f57c47385 100644 --- a/libcxx/include/__bit/countr.h +++ b/libcxx/include/__bit/countr.h @@ -11,7 +11,7 @@ #include <__concepts/arithmetic.h> #include <__config> -#include <__type_traits/is_unsigned.h> +#include <__type_traits/is_unsigned_integer.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -25,7 +25,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __countr_zero(_Tp __t) _NOEXCEPT { - static_assert(is_unsigned<_Tp>::value, "__countr_zero only works with unsigned types"); + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero only works with unsigned types"); return __builtin_ctzg(__t, numeric_limits<_Tp>::digits); } diff --git a/libcxx/include/__bit/popcount.h b/libcxx/include/__bit/popcount.h index 9ae572d466ba7..4be0e418e7aa6 100644 --- a/libcxx/include/__bit/popcount.h +++ b/libcxx/include/__bit/popcount.h @@ -11,7 +11,7 @@ #include 
<__concepts/arithmetic.h> #include <__config> -#include <__type_traits/is_unsigned.h> +#include <__type_traits/is_unsigned_integer.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -24,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __popcount(_Tp __t) _NOEXCEPT { - static_assert(is_unsigned<_Tp>::value, "__popcount only works with unsigned types"); + static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__popcount only works with unsigned types"); return __builtin_popcountg(__t); } From 3c56437eafee95f368feb20d28b74c29504b833d Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 14:31:13 +0200 Subject: [PATCH 070/851] [libc++] Refactor signed/unsigned integer traits (#142750) This patch does a few things: - `__libcpp_is_signed_integer` and `__libcpp_is_unsigned_integer` are refactored to be variable templates instead of class templates. - the two traits are merged into a single header `<__type_traits/integer_traits.h>`. - `__libcpp_signed_integer`, `__libcpp_unsigned_integer` and `__libcpp_integer` are moved into the same header. - The above mentioned concepts are renamed to `__signed_integer`, `__unsigned_integer` and `__signed_or_unsigned_integer` respectively. 
--- libcxx/include/CMakeLists.txt | 3 +- libcxx/include/__bit/bit_ceil.h | 4 +- libcxx/include/__bit/bit_floor.h | 4 +- libcxx/include/__bit/bit_log2.h | 4 +- libcxx/include/__bit/bit_width.h | 4 +- libcxx/include/__bit/countl.h | 9 +-- libcxx/include/__bit/countr.h | 9 +-- libcxx/include/__bit/has_single_bit.h | 4 +- libcxx/include/__bit/popcount.h | 7 +- libcxx/include/__bit/rotate.h | 11 ++- libcxx/include/__concepts/arithmetic.h | 13 ---- libcxx/include/__format/format_arg_store.h | 6 +- libcxx/include/__mdspan/extents.h | 6 +- .../include/__numeric/saturation_arithmetic.h | 30 ++++---- libcxx/include/__type_traits/integer_traits.h | 73 +++++++++++++++++++ .../include/__type_traits/is_signed_integer.h | 35 --------- .../__type_traits/is_unsigned_integer.h | 35 --------- libcxx/include/__utility/cmp.h | 16 ++-- libcxx/include/module.modulemap.in | 9 +-- .../__libcpp_integer.compile.pass.cpp | 62 ++++++++-------- .../__libcpp_signed_integer.compile.pass.cpp | 62 ++++++++-------- ...__libcpp_unsigned_integer.compile.pass.cpp | 62 ++++++++-------- 22 files changed, 223 insertions(+), 245 deletions(-) create mode 100644 libcxx/include/__type_traits/integer_traits.h delete mode 100644 libcxx/include/__type_traits/is_signed_integer.h delete mode 100644 libcxx/include/__type_traits/is_unsigned_integer.h diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 52611e43968bc..8931a1b35f6d3 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -800,6 +800,7 @@ set(files __type_traits/extent.h __type_traits/has_unique_object_representation.h __type_traits/has_virtual_destructor.h + __type_traits/integer_traits.h __type_traits/integral_constant.h __type_traits/invoke.h __type_traits/is_abstract.h @@ -850,7 +851,6 @@ set(files __type_traits/is_same.h __type_traits/is_scalar.h __type_traits/is_signed.h - __type_traits/is_signed_integer.h __type_traits/is_specialization.h __type_traits/is_standard_layout.h 
__type_traits/is_swappable.h @@ -864,7 +864,6 @@ set(files __type_traits/is_unbounded_array.h __type_traits/is_union.h __type_traits/is_unsigned.h - __type_traits/is_unsigned_integer.h __type_traits/is_valid_expansion.h __type_traits/is_void.h __type_traits/is_volatile.h diff --git a/libcxx/include/__bit/bit_ceil.h b/libcxx/include/__bit/bit_ceil.h index cfd792dc2e2ad..99881a8538290 100644 --- a/libcxx/include/__bit/bit_ceil.h +++ b/libcxx/include/__bit/bit_ceil.h @@ -11,8 +11,8 @@ #include <__assert> #include <__bit/countl.h> -#include <__concepts/arithmetic.h> #include <__config> +#include <__type_traits/integer_traits.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -41,7 +41,7 @@ template # if _LIBCPP_STD_VER >= 20 -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp bit_ceil(_Tp __t) noexcept { return std::__bit_ceil(__t); } diff --git a/libcxx/include/__bit/bit_floor.h b/libcxx/include/__bit/bit_floor.h index 6bcbc53fb4972..799a064130b4b 100644 --- a/libcxx/include/__bit/bit_floor.h +++ b/libcxx/include/__bit/bit_floor.h @@ -10,8 +10,8 @@ #define _LIBCPP___BIT_BIT_FLOOR_H #include <__bit/bit_log2.h> -#include <__concepts/arithmetic.h> #include <__config> +#include <__type_traits/integer_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp bit_floor(_Tp __t) noexcept { return __t == 0 ? 
0 : _Tp{1} << std::__bit_log2(__t); } diff --git a/libcxx/include/__bit/bit_log2.h b/libcxx/include/__bit/bit_log2.h index b22e1ce1f84e6..8077cd91d6fd7 100644 --- a/libcxx/include/__bit/bit_log2.h +++ b/libcxx/include/__bit/bit_log2.h @@ -11,7 +11,7 @@ #include <__bit/countl.h> #include <__config> -#include <__type_traits/is_unsigned_integer.h> +#include <__type_traits/integer_traits.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -22,7 +22,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __bit_log2(_Tp __t) _NOEXCEPT { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__bit_log2 requires an unsigned integer type"); + static_assert(__is_unsigned_integer_v<_Tp>, "__bit_log2 requires an unsigned integer type"); return numeric_limits<_Tp>::digits - 1 - std::__countl_zero(__t); } diff --git a/libcxx/include/__bit/bit_width.h b/libcxx/include/__bit/bit_width.h index 853e481776f7d..75050acabbe88 100644 --- a/libcxx/include/__bit/bit_width.h +++ b/libcxx/include/__bit/bit_width.h @@ -10,8 +10,8 @@ #define _LIBCPP___BIT_BIT_WIDTH_H #include <__bit/bit_log2.h> -#include <__concepts/arithmetic.h> #include <__config> +#include <__type_traits/integer_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int bit_width(_Tp __t) noexcept { return __t == 0 ? 
0 : std::__bit_log2(__t) + 1; } diff --git a/libcxx/include/__bit/countl.h b/libcxx/include/__bit/countl.h index 9499bf9b458ee..075914020879a 100644 --- a/libcxx/include/__bit/countl.h +++ b/libcxx/include/__bit/countl.h @@ -9,9 +9,8 @@ #ifndef _LIBCPP___BIT_COUNTL_H #define _LIBCPP___BIT_COUNTL_H -#include <__concepts/arithmetic.h> #include <__config> -#include <__type_traits/is_unsigned_integer.h> +#include <__type_traits/integer_traits.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -25,18 +24,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 int __countl_zero(_Tp __t) _NOEXCEPT { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countl_zero requires an unsigned integer type"); + static_assert(__is_unsigned_integer_v<_Tp>, "__countl_zero requires an unsigned integer type"); return __builtin_clzg(__t, numeric_limits<_Tp>::digits); } #if _LIBCPP_STD_VER >= 20 -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int countl_zero(_Tp __t) noexcept { return std::__countl_zero(__t); } -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int countl_one(_Tp __t) noexcept { return __t != numeric_limits<_Tp>::max() ? 
std::countl_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; } diff --git a/libcxx/include/__bit/countr.h b/libcxx/include/__bit/countr.h index 1589f57c47385..f6c98695d3d06 100644 --- a/libcxx/include/__bit/countr.h +++ b/libcxx/include/__bit/countr.h @@ -9,9 +9,8 @@ #ifndef _LIBCPP___BIT_COUNTR_H #define _LIBCPP___BIT_COUNTR_H -#include <__concepts/arithmetic.h> #include <__config> -#include <__type_traits/is_unsigned_integer.h> +#include <__type_traits/integer_traits.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -25,18 +24,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD template [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __countr_zero(_Tp __t) _NOEXCEPT { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__countr_zero only works with unsigned types"); + static_assert(__is_unsigned_integer_v<_Tp>, "__countr_zero only works with unsigned types"); return __builtin_ctzg(__t, numeric_limits<_Tp>::digits); } #if _LIBCPP_STD_VER >= 20 -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int countr_zero(_Tp __t) noexcept { return std::__countr_zero(__t); } -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int countr_one(_Tp __t) noexcept { return __t != numeric_limits<_Tp>::max() ? 
std::countr_zero(static_cast<_Tp>(~__t)) : numeric_limits<_Tp>::digits; } diff --git a/libcxx/include/__bit/has_single_bit.h b/libcxx/include/__bit/has_single_bit.h index 52f5853a1bc8a..b43e69323e77b 100644 --- a/libcxx/include/__bit/has_single_bit.h +++ b/libcxx/include/__bit/has_single_bit.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___BIT_HAS_SINGLE_BIT_H #define _LIBCPP___BIT_HAS_SINGLE_BIT_H -#include <__concepts/arithmetic.h> #include <__config> +#include <__type_traits/integer_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -23,7 +23,7 @@ _LIBCPP_PUSH_MACROS _LIBCPP_BEGIN_NAMESPACE_STD -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr bool has_single_bit(_Tp __t) noexcept { return __t != 0 && (((__t & (__t - 1)) == 0)); } diff --git a/libcxx/include/__bit/popcount.h b/libcxx/include/__bit/popcount.h index 4be0e418e7aa6..8d9ba09938482 100644 --- a/libcxx/include/__bit/popcount.h +++ b/libcxx/include/__bit/popcount.h @@ -9,9 +9,8 @@ #ifndef _LIBCPP___BIT_POPCOUNT_H #define _LIBCPP___BIT_POPCOUNT_H -#include <__concepts/arithmetic.h> #include <__config> -#include <__type_traits/is_unsigned_integer.h> +#include <__type_traits/integer_traits.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -24,13 +23,13 @@ _LIBCPP_BEGIN_NAMESPACE_STD template [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR int __popcount(_Tp __t) _NOEXCEPT { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__popcount only works with unsigned types"); + static_assert(__is_unsigned_integer_v<_Tp>, "__popcount only works with unsigned types"); return __builtin_popcountg(__t); } #if _LIBCPP_STD_VER >= 20 -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr int popcount(_Tp __t) noexcept { return std::__popcount(__t); } diff --git a/libcxx/include/__bit/rotate.h 
b/libcxx/include/__bit/rotate.h index d79d98de296aa..c6f34bdaf6e63 100644 --- a/libcxx/include/__bit/rotate.h +++ b/libcxx/include/__bit/rotate.h @@ -9,9 +9,8 @@ #ifndef _LIBCPP___BIT_ROTATE_H #define _LIBCPP___BIT_ROTATE_H -#include <__concepts/arithmetic.h> #include <__config> -#include <__type_traits/is_unsigned_integer.h> +#include <__type_traits/integer_traits.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -25,7 +24,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // the rotr function becomes the ROR instruction. template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotl(_Tp __x, int __s) _NOEXCEPT { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotl requires an unsigned integer type"); + static_assert(__is_unsigned_integer_v<_Tp>, "__rotl requires an unsigned integer type"); const int __n = numeric_limits<_Tp>::digits; int __r = __s % __n; @@ -40,7 +39,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotl(_Tp __x, int __s) template _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotr(_Tp __x, int __s) _NOEXCEPT { - static_assert(__libcpp_is_unsigned_integer<_Tp>::value, "__rotr requires an unsigned integer type"); + static_assert(__is_unsigned_integer_v<_Tp>, "__rotr requires an unsigned integer type"); const int __n = numeric_limits<_Tp>::digits; int __r = __s % __n; @@ -55,12 +54,12 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp __rotr(_Tp __x, int __s) #if _LIBCPP_STD_VER >= 20 -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp rotl(_Tp __t, int __cnt) noexcept { return std::__rotl(__t, __cnt); } -template <__libcpp_unsigned_integer _Tp> +template <__unsigned_integer _Tp> [[nodiscard]] _LIBCPP_HIDE_FROM_ABI constexpr _Tp rotr(_Tp __t, int __cnt) noexcept { return std::__rotr(__t, __cnt); } diff --git a/libcxx/include/__concepts/arithmetic.h b/libcxx/include/__concepts/arithmetic.h index 0c44f117805f3..64c0200783df7 
100644 --- a/libcxx/include/__concepts/arithmetic.h +++ b/libcxx/include/__concepts/arithmetic.h @@ -13,8 +13,6 @@ #include <__type_traits/is_floating_point.h> #include <__type_traits/is_integral.h> #include <__type_traits/is_signed.h> -#include <__type_traits/is_signed_integer.h> -#include <__type_traits/is_unsigned_integer.h> #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) # pragma GCC system_header @@ -38,17 +36,6 @@ concept unsigned_integral = integral<_Tp> && !signed_integral<_Tp>; template concept floating_point = is_floating_point_v<_Tp>; -// Concept helpers for the internal type traits for the fundamental types. - -template -concept __libcpp_unsigned_integer = __libcpp_is_unsigned_integer<_Tp>::value; - -template -concept __libcpp_signed_integer = __libcpp_is_signed_integer<_Tp>::value; - -template -concept __libcpp_integer = __libcpp_unsigned_integer<_Tp> || __libcpp_signed_integer<_Tp>; - #endif // _LIBCPP_STD_VER >= 20 _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__format/format_arg_store.h b/libcxx/include/__format/format_arg_store.h index 87557aa4da7bb..fbb4cad21b232 100644 --- a/libcxx/include/__format/format_arg_store.h +++ b/libcxx/include/__format/format_arg_store.h @@ -14,7 +14,6 @@ # pragma GCC system_header #endif -#include <__concepts/arithmetic.h> #include <__concepts/same_as.h> #include <__config> #include <__cstddef/size_t.h> @@ -22,6 +21,7 @@ #include <__format/format_arg.h> #include <__type_traits/conditional.h> #include <__type_traits/extent.h> +#include <__type_traits/integer_traits.h> #include <__type_traits/remove_const.h> #include #include @@ -65,7 +65,7 @@ consteval __arg_t __determine_arg_t() { # endif // Signed integers -template +template consteval __arg_t __determine_arg_t() { if constexpr (sizeof(_Tp) <= sizeof(int)) return __arg_t::__int; @@ -80,7 +80,7 @@ consteval __arg_t __determine_arg_t() { } // Unsigned integers -template +template consteval __arg_t __determine_arg_t() { if constexpr (sizeof(_Tp) <= 
sizeof(unsigned)) return __arg_t::__unsigned; diff --git a/libcxx/include/__mdspan/extents.h b/libcxx/include/__mdspan/extents.h index 00454004851d5..99b54badf893c 100644 --- a/libcxx/include/__mdspan/extents.h +++ b/libcxx/include/__mdspan/extents.h @@ -21,11 +21,10 @@ #include <__config> #include <__concepts/arithmetic.h> -#include <__cstddef/byte.h> #include <__type_traits/common_type.h> +#include <__type_traits/integer_traits.h> #include <__type_traits/is_convertible.h> #include <__type_traits/is_nothrow_constructible.h> -#include <__type_traits/is_same.h> #include <__type_traits/make_unsigned.h> #include <__utility/integer_sequence.h> #include <__utility/unreachable.h> @@ -283,7 +282,8 @@ class extents { using size_type = make_unsigned_t; using rank_type = size_t; - static_assert(__libcpp_integer, "extents::index_type must be a signed or unsigned integer type"); + static_assert(__signed_or_unsigned_integer, + "extents::index_type must be a signed or unsigned integer type"); static_assert(((__mdspan_detail::__is_representable_as(_Extents) || (_Extents == dynamic_extent)) && ...), "extents ctor: arguments must be representable as index_type and nonnegative"); diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h index 4110a8cb142a5..9bd3af12c9572 100644 --- a/libcxx/include/__numeric/saturation_arithmetic.h +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -11,9 +11,9 @@ #define _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H #include <__assert> -#include <__concepts/arithmetic.h> #include <__config> #include <__memory/addressof.h> +#include <__type_traits/integer_traits.h> #include <__utility/cmp.h> #include @@ -28,12 +28,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 -template <__libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { if (_Tp __sum; !__builtin_add_overflow(__x, __y, 
std::addressof(__sum))) return __sum; // Handle overflow - if constexpr (__libcpp_unsigned_integer<_Tp>) { + if constexpr (__unsigned_integer<_Tp>) { return std::numeric_limits<_Tp>::max(); } else { // Signed addition overflow @@ -46,12 +46,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __add_sat(_Tp __x, _Tp __y) noexcept { } } -template <__libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { if (_Tp __sub; !__builtin_sub_overflow(__x, __y, std::addressof(__sub))) return __sub; // Handle overflow - if constexpr (__libcpp_unsigned_integer<_Tp>) { + if constexpr (__unsigned_integer<_Tp>) { // Overflows if (x < y) return std::numeric_limits<_Tp>::min(); } else { @@ -65,12 +65,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __sub_sat(_Tp __x, _Tp __y) noexcept { } } -template <__libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __mul_sat(_Tp __x, _Tp __y) noexcept { if (_Tp __mul; !__builtin_mul_overflow(__x, __y, std::addressof(__mul))) return __mul; // Handle overflow - if constexpr (__libcpp_unsigned_integer<_Tp>) { + if constexpr (__unsigned_integer<_Tp>) { return std::numeric_limits<_Tp>::max(); } else { // Signed multiplication overflow @@ -81,10 +81,10 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __mul_sat(_Tp __x, _Tp __y) noexcept { } } -template <__libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp __div_sat(_Tp __x, _Tp __y) noexcept { _LIBCPP_ASSERT_UNCATEGORIZED(__y != 0, "Division by 0 is undefined"); - if constexpr (__libcpp_unsigned_integer<_Tp>) { + if constexpr (__unsigned_integer<_Tp>) { return __x / __y; } else { // Handle signed division overflow @@ -94,7 +94,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Tp __div_sat(_Tp __x, _Tp __y) noexcept { } } -template <__libcpp_integer _Rp, __libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Rp, __signed_or_unsigned_integer _Tp> 
_LIBCPP_HIDE_FROM_ABI constexpr _Rp __saturate_cast(_Tp __x) noexcept { // Saturation is impossible edge case when ((min _Rp) < (min _Tp) && (max _Rp) > (max _Tp)) and it is expected to be // optimized out by the compiler. @@ -112,27 +112,27 @@ _LIBCPP_HIDE_FROM_ABI constexpr _Rp __saturate_cast(_Tp __x) noexcept { #if _LIBCPP_STD_VER >= 26 -template <__libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp add_sat(_Tp __x, _Tp __y) noexcept { return std::__add_sat(__x, __y); } -template <__libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp sub_sat(_Tp __x, _Tp __y) noexcept { return std::__sub_sat(__x, __y); } -template <__libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp mul_sat(_Tp __x, _Tp __y) noexcept { return std::__mul_sat(__x, __y); } -template <__libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Tp div_sat(_Tp __x, _Tp __y) noexcept { return std::__div_sat(__x, __y); } -template <__libcpp_integer _Rp, __libcpp_integer _Tp> +template <__signed_or_unsigned_integer _Rp, __signed_or_unsigned_integer _Tp> _LIBCPP_HIDE_FROM_ABI constexpr _Rp saturate_cast(_Tp __x) noexcept { return std::__saturate_cast<_Rp>(__x); } diff --git a/libcxx/include/__type_traits/integer_traits.h b/libcxx/include/__type_traits/integer_traits.h new file mode 100644 index 0000000000000..fad502c44e301 --- /dev/null +++ b/libcxx/include/__type_traits/integer_traits.h @@ -0,0 +1,73 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___TYPE_TRAITS_INTEGER_TRAITS_H +#define _LIBCPP___TYPE_TRAITS_INTEGER_TRAITS_H + +#include <__config> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +// This trait is to determine whether a type is a /signed integer type/ +// See [basic.fundamental]/p1 +template +inline const bool __is_signed_integer_v = false; +template <> +inline const bool __is_signed_integer_v = true; +template <> +inline const bool __is_signed_integer_v = true; +template <> +inline const bool __is_signed_integer_v = true; +template <> +inline const bool __is_signed_integer_v = true; +template <> +inline const bool __is_signed_integer_v = true; +#if _LIBCPP_HAS_INT128 +template <> +inline const bool __is_signed_integer_v<__int128_t> = true; +#endif + +// This trait is to determine whether a type is an /unsigned integer type/ +// See [basic.fundamental]/p2 +template +inline const bool __is_unsigned_integer_v = false; +template <> +inline const bool __is_unsigned_integer_v = true; +template <> +inline const bool __is_unsigned_integer_v = true; +template <> +inline const bool __is_unsigned_integer_v = true; +template <> +inline const bool __is_unsigned_integer_v = true; +template <> +inline const bool __is_unsigned_integer_v = true; +#if _LIBCPP_HAS_INT128 +template <> +inline const bool __is_unsigned_integer_v<__uint128_t> = true; +#endif + +#if _LIBCPP_STD_VER >= 20 +template +concept __signed_integer = __is_signed_integer_v<_Tp>; + +template +concept __unsigned_integer = __is_unsigned_integer_v<_Tp>; + +// This isn't called __integer, because an integer type according to [basic.fundamental]/p11 is the same as an integral +// type. An integral type is _not_ the same set of types as signed and unsigned integer types combined. 
+template +concept __signed_or_unsigned_integer = __signed_integer<_Tp> || __unsigned_integer<_Tp>; +#endif + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___TYPE_TRAITS_INTEGER_TRAITS_H diff --git a/libcxx/include/__type_traits/is_signed_integer.h b/libcxx/include/__type_traits/is_signed_integer.h deleted file mode 100644 index 62943902a1834..0000000000000 --- a/libcxx/include/__type_traits/is_signed_integer.h +++ /dev/null @@ -1,35 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_IS_SIGNED_INTEGER_H -#define _LIBCPP___TYPE_TRAITS_IS_SIGNED_INTEGER_H - -#include <__config> -#include <__type_traits/integral_constant.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -// clang-format off -template struct __libcpp_is_signed_integer : false_type {}; -template <> struct __libcpp_is_signed_integer : true_type {}; -template <> struct __libcpp_is_signed_integer : true_type {}; -template <> struct __libcpp_is_signed_integer : true_type {}; -template <> struct __libcpp_is_signed_integer : true_type {}; -template <> struct __libcpp_is_signed_integer : true_type {}; -#if _LIBCPP_HAS_INT128 -template <> struct __libcpp_is_signed_integer<__int128_t> : true_type {}; -#endif -// clang-format on - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_IS_SIGNED_INTEGER_H diff --git a/libcxx/include/__type_traits/is_unsigned_integer.h b/libcxx/include/__type_traits/is_unsigned_integer.h deleted file mode 100644 index 74414a831e79a..0000000000000 --- a/libcxx/include/__type_traits/is_unsigned_integer.h +++ /dev/null @@ -1,35 +0,0 @@ 
-//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H -#define _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H - -#include <__config> -#include <__type_traits/integral_constant.h> - -#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) -# pragma GCC system_header -#endif - -_LIBCPP_BEGIN_NAMESPACE_STD - -// clang-format off -template struct __libcpp_is_unsigned_integer : false_type {}; -template <> struct __libcpp_is_unsigned_integer : true_type {}; -template <> struct __libcpp_is_unsigned_integer : true_type {}; -template <> struct __libcpp_is_unsigned_integer : true_type {}; -template <> struct __libcpp_is_unsigned_integer : true_type {}; -template <> struct __libcpp_is_unsigned_integer : true_type {}; -#if _LIBCPP_HAS_INT128 -template <> struct __libcpp_is_unsigned_integer<__uint128_t> : true_type {}; -#endif -// clang-format on - -_LIBCPP_END_NAMESPACE_STD - -#endif // _LIBCPP___TYPE_TRAITS_IS_UNSIGNED_INTEGER_H diff --git a/libcxx/include/__utility/cmp.h b/libcxx/include/__utility/cmp.h index b7c1ed614dfcb..14dc0c154c040 100644 --- a/libcxx/include/__utility/cmp.h +++ b/libcxx/include/__utility/cmp.h @@ -9,8 +9,8 @@ #ifndef _LIBCPP___UTILITY_CMP_H #define _LIBCPP___UTILITY_CMP_H -#include <__concepts/arithmetic.h> #include <__config> +#include <__type_traits/integer_traits.h> #include <__type_traits/is_signed.h> #include <__type_traits/make_unsigned.h> #include @@ -26,7 +26,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD #if _LIBCPP_STD_VER >= 20 -template <__libcpp_integer _Tp, __libcpp_integer _Up> +template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up> _LIBCPP_HIDE_FROM_ABI constexpr 
bool cmp_equal(_Tp __t, _Up __u) noexcept { if constexpr (is_signed_v<_Tp> == is_signed_v<_Up>) return __t == __u; @@ -36,12 +36,12 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_equal(_Tp __t, _Up __u) noexcept { return __u < 0 ? false : __t == make_unsigned_t<_Up>(__u); } -template <__libcpp_integer _Tp, __libcpp_integer _Up> +template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up> _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_not_equal(_Tp __t, _Up __u) noexcept { return !std::cmp_equal(__t, __u); } -template <__libcpp_integer _Tp, __libcpp_integer _Up> +template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up> _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_less(_Tp __t, _Up __u) noexcept { if constexpr (is_signed_v<_Tp> == is_signed_v<_Up>) return __t < __u; @@ -51,22 +51,22 @@ _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_less(_Tp __t, _Up __u) noexcept { return __u < 0 ? false : __t < make_unsigned_t<_Up>(__u); } -template <__libcpp_integer _Tp, __libcpp_integer _Up> +template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up> _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_greater(_Tp __t, _Up __u) noexcept { return std::cmp_less(__u, __t); } -template <__libcpp_integer _Tp, __libcpp_integer _Up> +template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up> _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_less_equal(_Tp __t, _Up __u) noexcept { return !std::cmp_greater(__t, __u); } -template <__libcpp_integer _Tp, __libcpp_integer _Up> +template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up> _LIBCPP_HIDE_FROM_ABI constexpr bool cmp_greater_equal(_Tp __t, _Up __u) noexcept { return !std::cmp_less(__t, __u); } -template <__libcpp_integer _Tp, __libcpp_integer _Up> +template <__signed_or_unsigned_integer _Tp, __signed_or_unsigned_integer _Up> _LIBCPP_HIDE_FROM_ABI constexpr bool in_range(_Up __u) noexcept { return std::cmp_less_equal(__u, numeric_limits<_Tp>::max()) && std::cmp_greater_equal(__u, 
numeric_limits<_Tp>::min()); diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index 7f625cefed1c2..f5fd970934e9b 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -94,6 +94,7 @@ module std_core [system] { module extent { header "__type_traits/extent.h" } module has_unique_object_representation { header "__type_traits/has_unique_object_representation.h" } module has_virtual_destructor { header "__type_traits/has_virtual_destructor.h" } + module integer_traits { header "__type_traits/integer_traits.h" } module integral_constant { header "__type_traits/integral_constant.h" } module invoke { header "__type_traits/invoke.h" } module is_abstract { @@ -284,10 +285,6 @@ module std_core [system] { header "__type_traits/is_scalar.h" export std_core.type_traits.integral_constant } - module is_signed_integer { - header "__type_traits/is_signed_integer.h" - export std_core.type_traits.integral_constant - } module is_signed { header "__type_traits/is_signed.h" export std_core.type_traits.integral_constant @@ -340,10 +337,6 @@ module std_core [system] { header "__type_traits/is_union.h" export std_core.type_traits.integral_constant } - module is_unsigned_integer { - header "__type_traits/is_unsigned_integer.h" - export std_core.type_traits.integral_constant - } module is_unsigned { header "__type_traits/is_unsigned.h" export std_core.type_traits.integral_constant diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp index 563580b687955..4958a258137a1 100644 --- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp +++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_integer.compile.pass.cpp @@ -11,9 +11,9 @@ // Concept helpers for the internal type traits for the fundamental types. 
// template -// concept __libcpp_integer; +// concept __signed_or_unsigned_integer; -#include +#include <__type_traits/integer_traits.h> #include "test_macros.h" @@ -24,40 +24,40 @@ enum SomeEnum {}; enum class SomeScopedEnum {}; // Unsigned -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); #if _LIBCPP_HAS_INT128 -static_assert(std::__libcpp_integer<__uint128_t>); +static_assert(std::__signed_or_unsigned_integer<__uint128_t>); #endif // Signed -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); -static_assert(std::__libcpp_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); +static_assert(std::__signed_or_unsigned_integer); #if _LIBCPP_HAS_INT128 -static_assert(std::__libcpp_integer<__int128_t>); +static_assert(std::__signed_or_unsigned_integer<__int128_t>); #endif // Non-integer -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); #ifndef TEST_HAS_NO_WIDE_CHARACTERS -static_assert(!std::__libcpp_integer); +static_assert(!std::__signed_or_unsigned_integer); #endif -static_assert(!std::__libcpp_integer); 
-static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); -static_assert(!std::__libcpp_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); +static_assert(!std::__signed_or_unsigned_integer); diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp index d1e21ee96b073..3fa342685770c 100644 --- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp +++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_signed_integer.compile.pass.cpp @@ -11,9 +11,9 @@ // Concept helpers for the internal type traits for the fundamental types. 
// template -// concept __libcpp_signed_integer; +// concept __signed_integer; -#include +#include <__type_traits/integer_traits.h> #include "test_macros.h" @@ -24,40 +24,40 @@ enum SomeEnum {}; enum class SomeScopedEnum {}; // Unsigned -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); #if _LIBCPP_HAS_INT128 -static_assert(!std::__libcpp_signed_integer<__uint128_t>); +static_assert(!std::__signed_integer<__uint128_t>); #endif // Signed -static_assert(std::__libcpp_signed_integer); -static_assert(std::__libcpp_signed_integer); -static_assert(std::__libcpp_signed_integer); -static_assert(std::__libcpp_signed_integer); -static_assert(std::__libcpp_signed_integer); -static_assert(std::__libcpp_signed_integer); +static_assert(std::__signed_integer); +static_assert(std::__signed_integer); +static_assert(std::__signed_integer); +static_assert(std::__signed_integer); +static_assert(std::__signed_integer); +static_assert(std::__signed_integer); #if _LIBCPP_HAS_INT128 -static_assert(std::__libcpp_signed_integer<__int128_t>); +static_assert(std::__signed_integer<__int128_t>); #endif // Non-integer -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); #ifndef TEST_HAS_NO_WIDE_CHARACTERS -static_assert(!std::__libcpp_signed_integer); +static_assert(!std::__signed_integer); #endif -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); 
-static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); -static_assert(!std::__libcpp_signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); +static_assert(!std::__signed_integer); diff --git a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp index c671f03cbfce4..ff60f32319171 100644 --- a/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp +++ b/libcxx/test/libcxx/concepts/concepts.arithmetic/__libcpp_unsigned_integer.compile.pass.cpp @@ -11,9 +11,9 @@ // Concept helpers for the internal type traits for the fundamental types. 
// template -// concept __libcpp_unsigned_integer; +// concept __unsigned_integer; -#include +#include <__type_traits/integer_traits.h> #include "test_macros.h" @@ -24,40 +24,40 @@ enum SomeEnum {}; enum class SomeScopedEnum {}; // Unsigned -static_assert(std::__libcpp_unsigned_integer); -static_assert(std::__libcpp_unsigned_integer); -static_assert(std::__libcpp_unsigned_integer); -static_assert(std::__libcpp_unsigned_integer); -static_assert(std::__libcpp_unsigned_integer); -static_assert(std::__libcpp_unsigned_integer); +static_assert(std::__unsigned_integer); +static_assert(std::__unsigned_integer); +static_assert(std::__unsigned_integer); +static_assert(std::__unsigned_integer); +static_assert(std::__unsigned_integer); +static_assert(std::__unsigned_integer); #if _LIBCPP_HAS_INT128 -static_assert(std::__libcpp_unsigned_integer<__uint128_t>); +static_assert(std::__unsigned_integer<__uint128_t>); #endif // Signed -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); #if _LIBCPP_HAS_INT128 -static_assert(!std::__libcpp_unsigned_integer<__int128_t>); +static_assert(!std::__unsigned_integer<__int128_t>); #endif // Non-integer -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); #ifndef TEST_HAS_NO_WIDE_CHARACTERS -static_assert(!std::__libcpp_unsigned_integer); +static_assert(!std::__unsigned_integer); #endif 
-static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); -static_assert(!std::__libcpp_unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); +static_assert(!std::__unsigned_integer); From b10d711362b8634cefcb288d9f1b577f63adb9f7 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 14:33:41 +0200 Subject: [PATCH 071/851] [libc++][NFC] Move __libcpp_is_integral into the else branch (#142556) This makes it clear that `__libcpp_is_integral` is an implementation detail of `is_integral` if we don't have `__is_integral` and not its own utility. 
--- libcxx/include/__type_traits/is_integral.h | 24 +++++++++++----------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/libcxx/include/__type_traits/is_integral.h b/libcxx/include/__type_traits/is_integral.h index 7f7ac26beb770..5a340965f0384 100644 --- a/libcxx/include/__type_traits/is_integral.h +++ b/libcxx/include/__type_traits/is_integral.h @@ -19,6 +19,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD +#if __has_builtin(__is_integral) + +template +struct _LIBCPP_NO_SPECIALIZATIONS is_integral : _BoolConstant<__is_integral(_Tp)> {}; + +# if _LIBCPP_STD_VER >= 17 +template +_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_integral_v = __is_integral(_Tp); +# endif + +#else + // clang-format off template struct __libcpp_is_integral { enum { value = 0 }; }; template <> struct __libcpp_is_integral { enum { value = 1 }; }; @@ -47,18 +59,6 @@ template <> struct __libcpp_is_integral<__uint128_t> { enum { va #endif // clang-format on -#if __has_builtin(__is_integral) - -template -struct _LIBCPP_NO_SPECIALIZATIONS is_integral : _BoolConstant<__is_integral(_Tp)> {}; - -# if _LIBCPP_STD_VER >= 17 -template -_LIBCPP_NO_SPECIALIZATIONS inline constexpr bool is_integral_v = __is_integral(_Tp); -# endif - -#else - template struct is_integral : public _BoolConstant<__libcpp_is_integral<__remove_cv_t<_Tp> >::value> {}; From 2692c3aa6760f1e4ea015f906926f63ec7dce044 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 11 Jun 2025 12:39:09 +0000 Subject: [PATCH 072/851] [gn build] Port 3c56437eafee --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 803247bd7881e..41516d677c45a 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -1457,6 +1457,7 @@ if (current_toolchain == default_toolchain) { "__type_traits/extent.h", 
"__type_traits/has_unique_object_representation.h", "__type_traits/has_virtual_destructor.h", + "__type_traits/integer_traits.h", "__type_traits/integral_constant.h", "__type_traits/invoke.h", "__type_traits/is_abstract.h", @@ -1507,7 +1508,6 @@ if (current_toolchain == default_toolchain) { "__type_traits/is_same.h", "__type_traits/is_scalar.h", "__type_traits/is_signed.h", - "__type_traits/is_signed_integer.h", "__type_traits/is_specialization.h", "__type_traits/is_standard_layout.h", "__type_traits/is_swappable.h", @@ -1521,7 +1521,6 @@ if (current_toolchain == default_toolchain) { "__type_traits/is_unbounded_array.h", "__type_traits/is_union.h", "__type_traits/is_unsigned.h", - "__type_traits/is_unsigned_integer.h", "__type_traits/is_valid_expansion.h", "__type_traits/is_void.h", "__type_traits/is_volatile.h", From 3d7aa961ac96f83d2e28f107c6dfa5a6a279b364 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 11 Jun 2025 13:56:30 +0100 Subject: [PATCH 073/851] [DebugInfo][RemoveDIs] Use autoupgrader to convert old debug-info (#143452) By chance, two things have prevented the autoupgrade path being exercised much so far: * LLParser setting the debug-info mode to "old" on seeing intrinsics, * The test in AutoUpgrade.cpp wanting to upgrade into a "new" debug-info block. In practice, this appears to mean this code path hasn't seen the various invalid inputs that can come its way. This commit does a number of things: * Tolerates the various illegal inputs that can be written with debug-intrinsics, and that must be tolerated until the Verifier runs, * Printing illegal/null DbgRecord fields must succeed, * Verifier errors need to localise the function/block where the error is, * Tests that now see debug records will print debug-record errors, Plus a few new tests for other intrinsic-to-debug-record failures modes I found. 
There are also two edge cases: * Some of the unit tests switch back and forth between intrinsic and record modes at will; I've deleted coverage and some assertions to tolerate this as intrinsic support is now Gone (TM), * In sroa-extract-bits.ll, the order of debug records flips. This is because the autoupgrader upgrades in the opposite order to the basic block conversion routines... which doesn't change the record order, but _does_ change the use list order in Metadata! This should (TM) have no consequence to the correctness of LLVM, but will change the order of various records and the order of DWARF record output too. I tried to reduce this patch to a smaller collection of changes, but they're all intertwined, sorry. --- llvm/lib/AsmParser/LLParser.cpp | 2 - llvm/lib/IR/AsmWriter.cpp | 39 +++++++--- llvm/lib/IR/AutoUpgrade.cpp | 77 +++++++++++++------ llvm/lib/IR/BasicBlock.cpp | 4 - llvm/lib/IR/Verifier.cpp | 29 +++---- .../drop-debug-info-nonzero-alloca.ll | 6 +- .../parse-and-verify/verify.ll | 18 ++--- .../DebugInfo/Generic/sroa-extract-bits.ll | 28 +++---- .../IROutliner/outlining-debug-statements.ll | 3 +- llvm/test/Transforms/ObjCARC/code-motion.ll | 13 ++-- .../RemoveDI/invalid-dbg-declare-operands.ll | 46 +++++++++++ .../Verifier/dbg-declare-invalid-debug-loc.ll | 42 ++++++++++ .../diexpression-entry-value-llvm-ir.ll | 6 +- .../test/Verifier/llvm.dbg.declare-address.ll | 4 +- .../Verifier/llvm.dbg.declare-expression.ll | 5 +- .../Verifier/llvm.dbg.declare-variable.ll | 11 ++- .../llvm.dbg.intrinsic-dbg-attachment.ll | 16 ++-- .../Verifier/llvm.dbg.value-expression.ll | 5 +- llvm/test/Verifier/llvm.dbg.value-value.ll | 4 +- llvm/test/Verifier/llvm.dbg.value-variable.ll | 5 +- llvm/unittests/IR/DebugInfoTest.cpp | 13 ---- 21 files changed, 249 insertions(+), 127 deletions(-) create mode 100644 llvm/test/Verifier/RemoveDI/invalid-dbg-declare-operands.ll create mode 100644 llvm/test/Verifier/dbg-declare-invalid-debug-loc.ll diff --git 
a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index b933d240c4d27..5c007dcf00224 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -8336,8 +8336,6 @@ bool LLParser::parseCall(Instruction *&Inst, PerFunctionState &PFS, return error(CallLoc, "llvm.dbg intrinsic should not appear in a module " "using non-intrinsic debug info"); } - if (!SeenOldDbgInfoFormat) - M->setNewDbgInfoFormatFlag(false); SeenOldDbgInfoFormat = true; } CI->setAttributes(PAL); diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp index 7223dd845d18d..7828ba45ec27f 100644 --- a/llvm/lib/IR/AsmWriter.cpp +++ b/llvm/lib/IR/AsmWriter.cpp @@ -1204,17 +1204,23 @@ void SlotTracker::processFunctionMetadata(const Function &F) { } void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { + // Tolerate null metadata pointers: it's a completely illegal debug record, + // but we can have faulty metadata from debug-intrinsic days being + // autoupgraded into debug records. This gets caught by the verifier, which + // then will print the faulty IR, hitting this code path. if (const DbgVariableRecord *DVR = dyn_cast(&DR)) { // Process metadata used by DbgRecords; we only specifically care about the // DILocalVariable, DILocation, and DIAssignID fields, as the Value and // Expression fields should only be printed inline and so do not use a slot. // Note: The above doesn't apply for empty-metadata operands. 
- if (auto *Empty = dyn_cast(DVR->getRawLocation())) + if (auto *Empty = dyn_cast_if_present(DVR->getRawLocation())) CreateMetadataSlot(Empty); - CreateMetadataSlot(DVR->getRawVariable()); + if (DVR->getRawVariable()) + CreateMetadataSlot(DVR->getRawVariable()); if (DVR->isDbgAssign()) { - CreateMetadataSlot(cast(DVR->getRawAssignID())); - if (auto *Empty = dyn_cast(DVR->getRawAddress())) + if (auto *AssignID = DVR->getRawAssignID()) + CreateMetadataSlot(cast(AssignID)); + if (auto *Empty = dyn_cast_if_present(DVR->getRawAddress())) CreateMetadataSlot(Empty); } } else if (const DbgLabelRecord *DLR = dyn_cast(&DR)) { @@ -1222,7 +1228,8 @@ void SlotTracker::processDbgRecordMetadata(const DbgRecord &DR) { } else { llvm_unreachable("unsupported DbgRecord kind"); } - CreateMetadataSlot(DR.getDebugLoc().getAsMDNode()); + if (DR.getDebugLoc()) + CreateMetadataSlot(DR.getDebugLoc().getAsMDNode()); } void SlotTracker::processInstructionMetadata(const Instruction &I) { @@ -4867,22 +4874,30 @@ void AssemblyWriter::printDbgVariableRecord(const DbgVariableRecord &DVR) { llvm_unreachable( "Tried to print a DbgVariableRecord with an invalid LocationType!"); } + + auto PrintOrNull = [&](Metadata *M) { + if (!M) + Out << "(null)"; + else + WriteAsOperandInternal(Out, M, WriterCtx, true); + }; + Out << "("; - WriteAsOperandInternal(Out, DVR.getRawLocation(), WriterCtx, true); + PrintOrNull(DVR.getRawLocation()); Out << ", "; - WriteAsOperandInternal(Out, DVR.getRawVariable(), WriterCtx, true); + PrintOrNull(DVR.getRawVariable()); Out << ", "; - WriteAsOperandInternal(Out, DVR.getRawExpression(), WriterCtx, true); + PrintOrNull(DVR.getRawExpression()); Out << ", "; if (DVR.isDbgAssign()) { - WriteAsOperandInternal(Out, DVR.getRawAssignID(), WriterCtx, true); + PrintOrNull(DVR.getRawAssignID()); Out << ", "; - WriteAsOperandInternal(Out, DVR.getRawAddress(), WriterCtx, true); + PrintOrNull(DVR.getRawAddress()); Out << ", "; - WriteAsOperandInternal(Out, DVR.getRawAddressExpression(), 
WriterCtx, true); + PrintOrNull(DVR.getRawAddressExpression()); Out << ", "; } - WriteAsOperandInternal(Out, DVR.getDebugLoc().getAsMDNode(), WriterCtx, true); + PrintOrNull(DVR.getDebugLoc().getAsMDNode()); Out << ")"; } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 814c00c669cb3..cb90af36f3d9f 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1155,8 +1155,7 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, case 'd': if (Name.consume_front("dbg.")) { // Mark debug intrinsics for upgrade to new debug format. - if (CanUpgradeDebugIntrinsicsToRecords && - F->getParent()->IsNewDbgInfoFormat) { + if (CanUpgradeDebugIntrinsicsToRecords) { if (Name == "addr" || Name == "value" || Name == "assign" || Name == "declare" || Name == "label") { // There's no function to replace these with. @@ -4395,39 +4394,66 @@ static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, return Builder.CreateBitCast(RMW, RetTy); } -/// Helper to unwrap intrinsic call MetadataAsValue operands. -template -static MDType *unwrapMAVOp(CallBase *CI, unsigned Op) { - if (MetadataAsValue *MAV = dyn_cast(CI->getArgOperand(Op))) - return dyn_cast(MAV->getMetadata()); +/// Helper to unwrap intrinsic call MetadataAsValue operands. Return as a +/// plain MDNode, as it's the verifier's job to check these are the correct +/// types later. +static MDNode *unwrapMAVOp(CallBase *CI, unsigned Op) { + if (Op < CI->arg_size()) { + if (MetadataAsValue *MAV = + dyn_cast(CI->getArgOperand(Op))) { + Metadata *MD = MAV->getMetadata(); + return dyn_cast_if_present(MD); + } + } + return nullptr; +} + +/// Helper to unwrap Metadata MetadataAsValue operands, such as the Value field. 
+static Metadata *unwrapMAVMetadataOp(CallBase *CI, unsigned Op) { + if (Op < CI->arg_size()) + if (MetadataAsValue *MAV = dyn_cast(CI->getArgOperand(Op))) + return MAV->getMetadata(); return nullptr; } +static MDNode *getDebugLocSafe(const Instruction *I) { + // The MDNode attached to this instruction might not be the correct type, + // as the verifier has not yet be run. Fetch it as a bare MDNode. + return I->getDebugLoc().getAsMDNode(); +} + /// Convert debug intrinsic calls to non-instruction debug records. /// \p Name - Final part of the intrinsic name, e.g. 'value' in llvm.dbg.value. /// \p CI - The debug intrinsic call. static void upgradeDbgIntrinsicToDbgRecord(StringRef Name, CallBase *CI) { DbgRecord *DR = nullptr; if (Name == "label") { - DR = new DbgLabelRecord(unwrapMAVOp(CI, 0), CI->getDebugLoc()); + DR = DbgLabelRecord::createUnresolvedDbgLabelRecord(unwrapMAVOp(CI, 0), + CI->getDebugLoc()); } else if (Name == "assign") { - DR = new DbgVariableRecord( - unwrapMAVOp(CI, 0), unwrapMAVOp(CI, 1), - unwrapMAVOp(CI, 2), unwrapMAVOp(CI, 3), - unwrapMAVOp(CI, 4), unwrapMAVOp(CI, 5), - CI->getDebugLoc()); + DR = DbgVariableRecord::createUnresolvedDbgVariableRecord( + DbgVariableRecord::LocationType::Assign, unwrapMAVMetadataOp(CI, 0), + unwrapMAVOp(CI, 1), unwrapMAVOp(CI, 2), unwrapMAVOp(CI, 3), + unwrapMAVMetadataOp(CI, 4), + /*The address is a Value ref, it will be stored as a Metadata */ + unwrapMAVOp(CI, 5), getDebugLocSafe(CI)); } else if (Name == "declare") { - DR = new DbgVariableRecord( - unwrapMAVOp(CI, 0), unwrapMAVOp(CI, 1), - unwrapMAVOp(CI, 2), CI->getDebugLoc(), - DbgVariableRecord::LocationType::Declare); + DR = DbgVariableRecord::createUnresolvedDbgVariableRecord( + DbgVariableRecord::LocationType::Declare, unwrapMAVMetadataOp(CI, 0), + unwrapMAVOp(CI, 1), unwrapMAVOp(CI, 2), nullptr, nullptr, nullptr, + getDebugLocSafe(CI)); } else if (Name == "addr") { // Upgrade dbg.addr to dbg.value with DW_OP_deref. 
- DIExpression *Expr = unwrapMAVOp(CI, 2); - Expr = DIExpression::append(Expr, dwarf::DW_OP_deref); - DR = new DbgVariableRecord(unwrapMAVOp(CI, 0), - unwrapMAVOp(CI, 1), Expr, - CI->getDebugLoc()); + MDNode *ExprNode = unwrapMAVOp(CI, 2); + // Don't try to add something to the expression if it's not an expression. + // Instead, allow the verifier to fail later. + if (DIExpression *Expr = dyn_cast(ExprNode)) { + ExprNode = DIExpression::append(Expr, dwarf::DW_OP_deref); + } + DR = DbgVariableRecord::createUnresolvedDbgVariableRecord( + DbgVariableRecord::LocationType::Value, unwrapMAVMetadataOp(CI, 0), + unwrapMAVOp(CI, 1), ExprNode, nullptr, nullptr, nullptr, + getDebugLocSafe(CI)); } else if (Name == "value") { // An old version of dbg.value had an extra offset argument. unsigned VarOp = 1; @@ -4440,9 +4466,10 @@ static void upgradeDbgIntrinsicToDbgRecord(StringRef Name, CallBase *CI) { VarOp = 2; ExprOp = 3; } - DR = new DbgVariableRecord( - unwrapMAVOp(CI, 0), unwrapMAVOp(CI, VarOp), - unwrapMAVOp(CI, ExprOp), CI->getDebugLoc()); + DR = DbgVariableRecord::createUnresolvedDbgVariableRecord( + DbgVariableRecord::LocationType::Value, unwrapMAVMetadataOp(CI, 0), + unwrapMAVOp(CI, VarOp), unwrapMAVOp(CI, ExprOp), nullptr, nullptr, + nullptr, getDebugLocSafe(CI)); } assert(DR && "Unhandled intrinsic kind in upgrade to DbgRecord"); CI->getParent()->insertDbgRecordBefore(DR, CI->getIterator()); diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index ed11ea06398f1..f716e9970b841 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -32,8 +32,6 @@ using namespace llvm; STATISTIC(NumInstrRenumberings, "Number of renumberings across all blocks"); DbgMarker *BasicBlock::createMarker(Instruction *I) { - assert(IsNewDbgInfoFormat && - "Tried to create a marker in a non new debug-info block!"); if (I->DebugMarker) return I->DebugMarker; DbgMarker *Marker = new DbgMarker(); @@ -43,8 +41,6 @@ DbgMarker 
*BasicBlock::createMarker(Instruction *I) { } DbgMarker *BasicBlock::createMarker(InstListType::iterator It) { - assert(IsNewDbgInfoFormat && - "Tried to create a marker in a non new debug-info block!"); if (It != end()) return createMarker(&*It); DbgMarker *DM = getTrailingDbgRecords(); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 592bb6aa90613..9ec94a8b80959 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -6714,7 +6714,7 @@ void Verifier::visit(DbgVariableRecord &DVR) { CheckDI(DVR.getType() == DbgVariableRecord::LocationType::Value || DVR.getType() == DbgVariableRecord::LocationType::Declare || DVR.getType() == DbgVariableRecord::LocationType::Assign, - "invalid #dbg record type", &DVR, DVR.getType()); + "invalid #dbg record type", &DVR, DVR.getType(), BB, F); // The location for a DbgVariableRecord must be either a ValueAsMetadata, // DIArgList, or an empty MDNode (which is a legacy representation for an @@ -6722,30 +6722,33 @@ void Verifier::visit(DbgVariableRecord &DVR) { auto *MD = DVR.getRawLocation(); CheckDI(MD && (isa(MD) || isa(MD) || (isa(MD) && !cast(MD)->getNumOperands())), - "invalid #dbg record address/value", &DVR, MD); + "invalid #dbg record address/value", &DVR, MD, BB, F); if (auto *VAM = dyn_cast(MD)) { visitValueAsMetadata(*VAM, F); if (DVR.isDbgDeclare()) { // Allow integers here to support inttoptr salvage. 
Type *Ty = VAM->getValue()->getType(); CheckDI(Ty->isPointerTy() || Ty->isIntegerTy(), - "location of #dbg_declare must be a pointer or int", &DVR, MD); + "location of #dbg_declare must be a pointer or int", &DVR, MD, BB, + F); } } else if (auto *AL = dyn_cast(MD)) { visitDIArgList(*AL, F); } CheckDI(isa_and_nonnull(DVR.getRawVariable()), - "invalid #dbg record variable", &DVR, DVR.getRawVariable()); + "invalid #dbg record variable", &DVR, DVR.getRawVariable(), BB, F); visitMDNode(*DVR.getRawVariable(), AreDebugLocsAllowed::No); CheckDI(isa_and_nonnull(DVR.getRawExpression()), - "invalid #dbg record expression", &DVR, DVR.getRawExpression()); + "invalid #dbg record expression", &DVR, DVR.getRawExpression(), BB, + F); visitMDNode(*DVR.getExpression(), AreDebugLocsAllowed::No); if (DVR.isDbgAssign()) { CheckDI(isa_and_nonnull(DVR.getRawAssignID()), - "invalid #dbg_assign DIAssignID", &DVR, DVR.getRawAssignID()); + "invalid #dbg_assign DIAssignID", &DVR, DVR.getRawAssignID(), BB, + F); visitMDNode(*cast(DVR.getRawAssignID()), AreDebugLocsAllowed::No); @@ -6756,29 +6759,29 @@ void Verifier::visit(DbgVariableRecord &DVR) { CheckDI( isa(RawAddr) || (isa(RawAddr) && !cast(RawAddr)->getNumOperands()), - "invalid #dbg_assign address", &DVR, DVR.getRawAddress()); + "invalid #dbg_assign address", &DVR, DVR.getRawAddress(), BB, F); if (auto *VAM = dyn_cast(RawAddr)) visitValueAsMetadata(*VAM, F); CheckDI(isa_and_nonnull(DVR.getRawAddressExpression()), "invalid #dbg_assign address expression", &DVR, - DVR.getRawAddressExpression()); + DVR.getRawAddressExpression(), BB, F); visitMDNode(*DVR.getAddressExpression(), AreDebugLocsAllowed::No); // All of the linked instructions should be in the same function as DVR. 
for (Instruction *I : at::getAssignmentInsts(&DVR)) CheckDI(DVR.getFunction() == I->getFunction(), - "inst not in same function as #dbg_assign", I, &DVR); + "inst not in same function as #dbg_assign", I, &DVR, BB, F); } // This check is redundant with one in visitLocalVariable(). DILocalVariable *Var = DVR.getVariable(); - CheckDI(isType(Var->getRawType()), "invalid type ref", Var, - Var->getRawType()); + CheckDI(isType(Var->getRawType()), "invalid type ref", Var, Var->getRawType(), + BB, F); auto *DLNode = DVR.getDebugLoc().getAsMDNode(); CheckDI(isa_and_nonnull(DLNode), "invalid #dbg record DILocation", - &DVR, DLNode); + &DVR, DLNode, BB, F); DILocation *Loc = DVR.getDebugLoc(); // The scopes for variables and !dbg attachments must agree. @@ -6790,7 +6793,7 @@ void Verifier::visit(DbgVariableRecord &DVR) { CheckDI(VarSP == LocSP, "mismatched subprogram between #dbg record variable and DILocation", &DVR, BB, F, Var, Var->getScope()->getSubprogram(), Loc, - Loc->getScope()->getSubprogram()); + Loc->getScope()->getSubprogram(), BB, F); verifyFnArgs(DVR); } diff --git a/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll b/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll index 2b089d2639375..c8b235757afba 100644 --- a/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll +++ b/llvm/test/Assembler/drop-debug-info-nonzero-alloca.ll @@ -12,8 +12,12 @@ entry: metadata ptr undef, metadata !DILocalVariable(scope: !1), metadata !DIExpression()) -; AS: llvm.dbg.value intrinsic requires a !dbg attachment +; AS: invalid #dbg record DILocation +; AS: #dbg_value(ptr undef, !{{[0-9]+}}, !DIExpression(), (null)) +; AS: label %entry +; AS: ptr @foo ; AS: warning: ignoring invalid debug info in + ret void } diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll index 0a4b7c255dc71..d1f1e1ce768dc 100644 --- 
a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll @@ -8,7 +8,7 @@ define dso_local void @fun2() !dbg !15 { ;; DIAssignID copied here from @fun() where it is used by intrinsics. - ; CHECK: dbg.assign not in same function as inst + ; CHECK: DVRAssign not in same function as inst %x = alloca i32, align 4, !DIAssignID !14 ret void } @@ -17,24 +17,24 @@ define dso_local void @fun() !dbg !7 { entry: %a = alloca i32, align 4, !DIAssignID !14 ;; Here something other than a dbg.assign intrinsic is using a DIAssignID. - ; CHECK: !DIAssignID should only be used by llvm.dbg.assign intrinsics + ; CHECK: !DIAssignID should only be used by Assign DVRs call void @llvm.dbg.value(metadata !14, metadata !10, metadata !DIExpression()), !dbg !13 ;; Each following dbg.assign has an argument of the incorrect type. - ; CHECK: invalid llvm.dbg.assign intrinsic address/value + ; CHECK: invalid #dbg record address/value call void @llvm.dbg.assign(metadata !3, metadata !10, metadata !DIExpression(), metadata !14, metadata ptr undef, metadata !DIExpression()), !dbg !13 - ; CHECK: invalid llvm.dbg.assign intrinsic variable + ; CHECK: invalid #dbg record variable call void @llvm.dbg.assign(metadata i32 0, metadata !2, metadata !DIExpression(), metadata !14, metadata ptr undef, metadata !DIExpression()), !dbg !13 - ; CHECK: invalid llvm.dbg.assign intrinsic expression + ; CHECK: invalid #dbg record expression call void @llvm.dbg.assign(metadata !14, metadata !10, metadata !2, metadata !14, metadata ptr undef, metadata !DIExpression()), !dbg !13 - ; CHECK: invalid llvm.dbg.assign intrinsic DIAssignID + ; CHECK: invalid #dbg_assign DIAssignID call void @llvm.dbg.assign(metadata !14, metadata !10, metadata !DIExpression(), metadata !2, metadata ptr undef, metadata !DIExpression()), !dbg !13 - ; CHECK: invalid llvm.dbg.assign intrinsic address + ; CHECK: invalid #dbg_assign address call void 
@llvm.dbg.assign(metadata !14, metadata !10, metadata !DIExpression(), metadata !14, metadata !3, metadata !DIExpression()), !dbg !13 ;; Empty metadata debug operands are allowed. - ; CHECK-NOT: invalid llvm.dbg.assign + ; CHECK-NOT: invalid #dbg record call void @llvm.dbg.assign(metadata !14, metadata !10, metadata !DIExpression(), metadata !14, metadata !2, metadata !DIExpression()), !dbg !13 - ; CHECK: invalid llvm.dbg.assign intrinsic address expression + ; CHECK: invalid #dbg_assign address expression call void @llvm.dbg.assign(metadata !14, metadata !10, metadata !DIExpression(), metadata !14, metadata ptr undef, metadata !2), !dbg !13 ret void } diff --git a/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll b/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll index f47e495db6617..6db453605cb57 100644 --- a/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll +++ b/llvm/test/DebugInfo/Generic/sroa-extract-bits.ll @@ -13,8 +13,8 @@ define i8 @test1(i32 %arg) { ; CHECK-NEXT: #dbg_value(i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META2:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7:![0-9]+]]) ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG]], 8 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i24 -; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 16), [[META7]]) -; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META9:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META10:![0-9]+]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 16), [[META7]]) ; CHECK-NEXT: ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ -36,11 +36,11 @@ define i8 @test2(i32 %arg1, i8 %arg2) { ; 
CHECK-NEXT: #dbg_value(i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META2]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 8 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i16 -; CHECK-NEXT: #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META9]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 16), [[META7]]) +; CHECK-NEXT: #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 16), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_21_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 24 ; CHECK-NEXT: [[PTR_SROA_21_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_21_0_EXTRACT_SHIFT]] to i8 -; CHECK-NEXT: #dbg_value(i8 [[PTR_SROA_21_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) -; CHECK-NEXT: #dbg_value(i8 [[ARG2]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i8 [[PTR_SROA_21_0_EXTRACT_TRUNC]], [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i8 [[ARG2]], [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) ; CHECK-NEXT: ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ -84,7 +84,7 @@ define i16 @test4(i32 %arg) { ; CHECK-NEXT: #dbg_value(i16 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META2]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 8), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG]], 16 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i16 -; CHECK-NEXT: #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i16 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]]) ; CHECK-NEXT: ret i16 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ 
-107,8 +107,8 @@ define i8 @test5(i32 %arg) { ; CHECK-NEXT: #dbg_value(i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]], [[META11:![0-9]+]], !DIExpression(), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG]], 8 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i24 -; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]]) -; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META9]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i24 [[PTR_SROA_2_0_EXTRACT_TRUNC]], [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 8, 8), [[META7]]) ; CHECK-NEXT: ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ -130,11 +130,11 @@ define i8 @test6(i32 %arg1, i8 %arg2) { ; CHECK-NEXT: #dbg_value(i8 poison, [[META2]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 8 ; CHECK-NEXT: [[PTR_SROA_2_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_2_0_EXTRACT_SHIFT]] to i16 -; CHECK-NEXT: #dbg_value(i16 poison, [[META9]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), [[META7]]) +; CHECK-NEXT: #dbg_value(i16 poison, [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_zext, 0, 16), [[META7]]) ; CHECK-NEXT: [[PTR_SROA_21_0_EXTRACT_SHIFT:%.*]] = lshr i32 [[ARG1]], 24 ; CHECK-NEXT: [[PTR_SROA_21_0_EXTRACT_TRUNC:%.*]] = trunc i32 [[PTR_SROA_21_0_EXTRACT_SHIFT]] to i8 -; CHECK-NEXT: #dbg_value(i8 poison, [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) -; CHECK-NEXT: #dbg_value(i8 poison, [[META8]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) +; CHECK-NEXT: #dbg_value(i8 poison, [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) +; 
CHECK-NEXT: #dbg_value(i8 poison, [[META10]], !DIExpression(DW_OP_LLVM_extract_bits_sext, 0, 8), [[META7]]) ; CHECK-NEXT: ret i8 [[PTR_SROA_0_0_EXTRACT_TRUNC]] ; entry: @@ -197,9 +197,9 @@ entry: ; CHECK: [[META5]] = !DIFile(filename: "dbg-bit-piece.cpp", directory: "") ; CHECK: [[META6]] = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) ; CHECK: [[META7]] = !DILocation(line: 0, scope: [[META3]]) -; CHECK: [[META8]] = !DILocalVariable(name: "z", scope: [[META3]], type: [[META6]]) -; CHECK: [[META9]] = !DILocalVariable(name: "y", scope: [[META3]], type: [[META10:![0-9]+]]) -; CHECK: [[META10]] = !DIBasicType(name: "signed int", size: 32, encoding: DW_ATE_signed) +; CHECK: [[META8]] = !DILocalVariable(name: "y", scope: [[META3]], type: [[META9:![0-9]+]]) +; CHECK: [[META9]] = !DIBasicType(name: "signed int", size: 32, encoding: DW_ATE_signed) +; CHECK: [[META10]] = !DILocalVariable(name: "z", scope: [[META3]], type: [[META6]]) ; CHECK: [[META11]] = !DILocalVariable(name: "x", scope: [[META3]], type: [[META12:![0-9]+]]) ; CHECK: [[META12]] = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) ;. 
diff --git a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll index bf846c310a525..c1140988fa916 100644 --- a/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll +++ b/llvm/test/Transforms/IROutliner/outlining-debug-statements.ll @@ -19,7 +19,7 @@ entry: %c = alloca i32, align 4 store i32 2, ptr %a, align 4 store i32 3, ptr %b, align 4 - call void @llvm.dbg.value(metadata i64 0, metadata !14, metadata !DIExpression()), !dbg !14 + call void @llvm.dbg.value(metadata i64 0, metadata !14, metadata !DIExpression()), !dbg !15 store i32 4, ptr %c, align 4 %al = load i32, ptr %a %bl = load i32, ptr %b @@ -62,3 +62,4 @@ entry: !12 = !DISubroutineType(types: !13) !13 = !{} !14 = !DILocalVariable(name: "p_6", arg: 1, scope: !11, line: 117, type: !1) +!15 = !DILocation(line: 1, scope: !11) diff --git a/llvm/test/Transforms/ObjCARC/code-motion.ll b/llvm/test/Transforms/ObjCARC/code-motion.ll index 499ee77bc6541..9009b98b4b1e3 100644 --- a/llvm/test/Transforms/ObjCARC/code-motion.ll +++ b/llvm/test/Transforms/ObjCARC/code-motion.ll @@ -1,4 +1,4 @@ -; RUN: opt -passes=objc-arc -S < %s | FileCheck %s +; RUN: opt -passes=objc-arc -S < %s 2>&1 | FileCheck %s '--implicit-check-not=ignoring invalid debug' declare void @alterRefCount() declare void @use(ptr) @@ -17,7 +17,7 @@ define i32 @test(ptr %x, ptr %y, i8 %z, i32 %i) { store i32 %i, ptr %i.addr, align 4 %v1 = tail call ptr @llvm.objc.retain(ptr %x) store i8 %z, ptr %x - call void @llvm.dbg.declare(metadata ptr %i.addr, metadata !9, metadata !DIExpression()), !dbg !10 + call void @llvm.dbg.declare(metadata ptr %i.addr, metadata !11, metadata !DIExpression()), !dbg !10 call void @alterRefCount() tail call void @llvm.objc.release(ptr %x) ret i32 %i @@ -64,7 +64,7 @@ define void @test3(ptr %obj, i1 %cond) { ; CHECK-NEXT: call void @use(ptr [[OBJ]]) ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: call void @llvm.objc.release(ptr 
[[OBJ]]) {{.*}}, !clang.imprecise_release !2 +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ]]) {{.*}}, !clang.imprecise_release ![[EMPTYMETA:[0-9]+]] ; CHECK-NEXT: ret void ; %v0 = call ptr @llvm.objc.retain(ptr %obj) @@ -102,8 +102,8 @@ define void @test4(ptr %obj0, ptr %obj1, i1 %cond) { ; CHECK-NEXT: call void @use(ptr [[OBJ1]]) ; CHECK-NEXT: br label [[JOIN]] ; CHECK: join: -; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ0]]) {{.*}}, !clang.imprecise_release !2 -; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ1]]) {{.*}}, !clang.imprecise_release !2 +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ0]]) {{.*}}, !clang.imprecise_release ![[EMPTYMETA]] +; CHECK-NEXT: call void @llvm.objc.release(ptr [[OBJ1]]) {{.*}}, !clang.imprecise_release ![[EMPTYMETA]] ; CHECK-NEXT: ret void ; %v0 = call ptr @llvm.objc.retain(ptr %obj0) @@ -190,6 +190,8 @@ attributes #0 = { readonly } !llvm.module.flags = !{!0, !1} +; CHECK: ![[EMPTYMETA]] = !{} + !0 = !{i32 2, !"Dwarf Version", i32 4} !1 = !{i32 2, !"Debug Info Version", i32 3} !2 = !DILocalVariable(name: "i", arg: 1, scope: !3, file: !4, line: 1, type: !7) @@ -201,3 +203,4 @@ attributes #0 = { readonly } !8 = distinct !DICompileUnit(language: DW_LANG_ObjC, file: !4, isOptimized: false, runtimeVersion: 2, emissionKind: FullDebug, enums: !9, nameTableKind: None) !9 = !{} !10 = !DILocation(line: 1, column: 14, scope: !3) +!11 = !DILocalVariable(name: "foo", scope: !3, type: !7) diff --git a/llvm/test/Verifier/RemoveDI/invalid-dbg-declare-operands.ll b/llvm/test/Verifier/RemoveDI/invalid-dbg-declare-operands.ll new file mode 100644 index 0000000000000..cdc9d8df82aa7 --- /dev/null +++ b/llvm/test/Verifier/RemoveDI/invalid-dbg-declare-operands.ll @@ -0,0 +1,46 @@ +; RUN: llvm-as %s -o - 2>&1 | FileCheck %s +; CHECK: invalid #dbg record expression +; +; Fossilised debug-info with only two arguments to dbg.declare have been +; spotted in LLVMs test suite (debug-info-always-inline.ll), test that this +; does not 
cause a crash. LLVM needs to be able to autoupgrade invalid +; dbg.declares to invalid #dbg_declares because this occurs before the +; Verifier runs. + +; ModuleID = 'out.ll' +source_filename = "llvm/test/DebugInfo/Generic/debug-info-always-inline.ll" +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +; Function Attrs: alwaysinline nounwind sspstrong +define i32 @_Z3foov() !dbg !7 { +entry: + %sum = alloca i32, align 4, !dbg !11 + call void @llvm.dbg.declare(metadata ptr %sum, metadata !26), !dbg !11 + ret i32 0, !dbg !15 +} + +declare void @_Z3barv() + +!llvm.module.flags = !{!0, !1} +!llvm.ident = !{!2} +!llvm.dbg.cu = !{!3} +!llvm.debugify = !{!5, !6} + +!0 = !{i32 2, !"Dwarf Version", i32 4} +!1 = !{i32 2, !"Debug Info Version", i32 3} +!2 = !{!"clang version 3.6.0 (217844)"} +!3 = distinct !DICompileUnit(language: DW_LANG_C, file: !4, producer: "debugify", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug) +!4 = !DIFile(filename: "/fast/fs/llvm-main/llvm/test/DebugInfo/Generic/debug-info-always-inline.ll", directory: "/") +!5 = !{i32 14} +!6 = !{i32 7} +!7 = distinct !DISubprogram(name: "_Z3foov", linkageName: "_Z3foov", scope: null, file: !4, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !3, retainedNodes: !9) +!8 = !DISubroutineType(types: !9) +!9 = !{} +!11 = !DILocation(line: 2, column: 1, scope: !7) +!15 = !DILocation(line: 6, column: 1, scope: !7) +!25 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!26 = !DILocalVariable(name: "b", scope: !7, file: !4, line: 1234, type: !25) + diff --git a/llvm/test/Verifier/dbg-declare-invalid-debug-loc.ll b/llvm/test/Verifier/dbg-declare-invalid-debug-loc.ll new file mode 100644 index 0000000000000..c521a9b8eb11b --- /dev/null +++ b/llvm/test/Verifier/dbg-declare-invalid-debug-loc.ll @@ -0,0 +1,42 @@ +; RUN: opt %s -o /dev/null -S 
2>&1 | FileCheck %s +; +; The last dbg.declare intrinsic in this file has an illegal DILocation -- this +; needs to pass through the autoupgrade to #dbg_declare process and then get +; caught by the verifier. +; +; CHECK: invalid #dbg record DILocation +; CHECK-NEXT: #dbg_declare(ptr %1, ![[VAR:[0-9]+]], !DIExpression(), ![[PROG:[0-9]+]]) +; CHECK-NEXT: ![[PROG]] = distinct !DISubprogram(name: "IgnoreIntrinsicTest", +; CHECK-NEXT: label %0 +; CHECK-NEXT: ptr @IgnoreIntrinsicTest + +declare void @llvm.dbg.declare(metadata, metadata, metadata) + +define i32 @IgnoreIntrinsicTest() !dbg !10 { + %1 = alloca i32, align 4 + call void @llvm.dbg.declare(metadata ptr %1, metadata !14, metadata !DIExpression()), !dbg !10 + store volatile i32 1, ptr %1, align 4 + %2 = load volatile i32, ptr %1, align 4 + %3 = mul nsw i32 %2, 42 + ret i32 %3 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !1, producer: "clang version 3.4 ", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !5, globals: !5, imports: !5) +!1 = !DIFile(filename: "", directory: "/Users/matt/ryan_bug") +!2 = !{!3} +!3 = !DICompositeType(tag: DW_TAG_enumeration_type, scope: !4, file: !1, line: 20, size: 32, align: 32, elements: !6) +!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "C", file: !1, line: 19, size: 8, align: 8, elements: !5) +!5 = !{} +!6 = !{!7} +!7 = !DIEnumerator(name: "max_frame_size", value: 0) +!8 = !{i32 2, !"Dwarf Version", i32 2} +!9 = !{i32 1, !"Debug Info Version", i32 3} +!10 = distinct !DISubprogram(name: "IgnoreIntrinsicTest", linkageName: "IgnoreIntrinsicTest", scope: !1, file: !1, line: 1, type: !11, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !5) +!11 = !DISubroutineType(types: !12) +!12 = !{!13} +!13 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) +!14 = 
!DILocalVariable(name: "x", scope: !10, file: !1, line: 2, type: !13) +!15 = !DILocation(line: 2, column: 16, scope: !10) diff --git a/llvm/test/Verifier/diexpression-entry-value-llvm-ir.ll b/llvm/test/Verifier/diexpression-entry-value-llvm-ir.ll index 652e6667bfc5c..1a28f0ec519f7 100644 --- a/llvm/test/Verifier/diexpression-entry-value-llvm-ir.ll +++ b/llvm/test/Verifier/diexpression-entry-value-llvm-ir.ll @@ -1,9 +1,9 @@ ; RUN: llvm-as -disable-output <%s 2>&1| FileCheck %s -; CHECK-NOT: llvm.dbg.value +; CHECK-NOT: #dbg_value ; CHECK: Entry values are only allowed in MIR unless they target a swiftasync Argument -; CHECK: call void @llvm.dbg.value(metadata i32 %param, metadata !{{.*}}, metadata !DIExpression(DW_OP_LLVM_entry_value, 1)) -; CHECK-NOT: llvm.dbg.value +; CHECK: #dbg_value(i32 %param, !{{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1), +; CHECK-NOT: #dbg_value ; CHECK-NOT: Entry values are only allowed ; CHECK: warning: ignoring invalid debug info diff --git a/llvm/test/Verifier/llvm.dbg.declare-address.ll b/llvm/test/Verifier/llvm.dbg.declare-address.ll index 219f9ca0a6679..251526b4c321b 100644 --- a/llvm/test/Verifier/llvm.dbg.declare-address.ll +++ b/llvm/test/Verifier/llvm.dbg.declare-address.ll @@ -1,6 +1,6 @@ ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s -; CHECK: invalid llvm.dbg.declare intrinsic address/value -; CHECK-NEXT: call void @llvm.dbg.declare({{.*}}) +; CHECK: invalid #dbg record address/value +; CHECK-NEXT: #dbg_declare({{.*}}) ; CHECK-NEXT: !"" ; CHECK: warning: ignoring invalid debug info diff --git a/llvm/test/Verifier/llvm.dbg.declare-expression.ll b/llvm/test/Verifier/llvm.dbg.declare-expression.ll index 671ec21780088..de65bb570677e 100644 --- a/llvm/test/Verifier/llvm.dbg.declare-expression.ll +++ b/llvm/test/Verifier/llvm.dbg.declare-expression.ll @@ -1,7 +1,6 @@ ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s -; CHECK: invalid llvm.dbg.declare intrinsic expression -; CHECK-NEXT: call void 
@llvm.dbg.declare({{.*}}) -; CHECK-NEXT: !"" +; CHECK: invalid #dbg record expression +; CHECK-NEXT: #dbg_declare({{.*}}) ; CHECK: warning: ignoring invalid debug info define void @foo(i32 %a) { diff --git a/llvm/test/Verifier/llvm.dbg.declare-variable.ll b/llvm/test/Verifier/llvm.dbg.declare-variable.ll index 4f0ae4daa822f..601fab190d36b 100644 --- a/llvm/test/Verifier/llvm.dbg.declare-variable.ll +++ b/llvm/test/Verifier/llvm.dbg.declare-variable.ll @@ -1,13 +1,16 @@ ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s -; CHECK: invalid llvm.dbg.declare intrinsic variable -; CHECK-NEXT: call void @llvm.dbg.declare({{.*}}) -; CHECK-NEXT: !"" +; CHECK: invalid #dbg record variable +; CHECK-NEXT: #dbg_declare({{.*}}) +; CHECK-NEXT: DISubprogram ; CHECK: warning: ignoring invalid debug info +;; This test ensures we report an illegal variable as illegal, but also that +;; the illegal MDNode is printed out (DISubprogram) to help localise. + define void @foo(i32 %a) { entry: %s = alloca i32 - call void @llvm.dbg.declare(metadata ptr %s, metadata !"", metadata !DIExpression()), !dbg !DILocation(scope: !1) + call void @llvm.dbg.declare(metadata ptr %s, metadata !1, metadata !DIExpression()), !dbg !DILocation(scope: !1) ret void } diff --git a/llvm/test/Verifier/llvm.dbg.intrinsic-dbg-attachment.ll b/llvm/test/Verifier/llvm.dbg.intrinsic-dbg-attachment.ll index 5d82f490e055d..b1e22b20d0864 100644 --- a/llvm/test/Verifier/llvm.dbg.intrinsic-dbg-attachment.ll +++ b/llvm/test/Verifier/llvm.dbg.intrinsic-dbg-attachment.ll @@ -5,8 +5,8 @@ entry: metadata ptr undef, metadata !DILocalVariable(scope: !1), metadata !DIExpression()) -; CHECK-LABEL: llvm.dbg.value intrinsic requires a !dbg attachment -; CHECK-NEXT: call void @llvm.dbg.value({{.*}}) +; CHECK-LABEL: invalid #dbg record DILocation +; CHECK-NEXT: #dbg_value({{.*}}) ; CHECK-NEXT: label %entry ; CHECK-NEXT: ptr @foo @@ -14,8 +14,8 @@ entry: metadata ptr undef, metadata !DILocalVariable(scope: !1), metadata 
!DIExpression()) -; CHECK-LABEL: llvm.dbg.declare intrinsic requires a !dbg attachment -; CHECK-NEXT: call void @llvm.dbg.declare({{.*}}) +; CHECK-LABEL: invalid #dbg record DILocation +; CHECK-NEXT: #dbg_declare({{.*}}) ; CHECK-NEXT: label %entry ; CHECK-NEXT: ptr @foo @@ -24,8 +24,8 @@ entry: metadata !DILocalVariable(scope: !1), metadata !DIExpression()), !dbg !DILocation(scope: !2) -; CHECK-LABEL: mismatched subprogram between llvm.dbg.value variable and !dbg attachment -; CHECK-NEXT: call void @llvm.dbg.value({{[^,]+}}, metadata ![[VAR:[0-9]+]], {{[^,]+}}), !dbg ![[LOC:[0-9]+]] +; CHECK-LABEL: mismatched subprogram between #dbg record variable and DILocation +; CHECK-NEXT: #dbg_value({{[^,]+}}, ![[VAR:[0-9]+]], {{[^,]+}}, ![[LOC:[0-9]+]]) ; CHECK-NEXT: label %entry ; CHECK-NEXT: ptr @foo ; CHECK-NEXT: ![[VAR]] = !DILocalVariable({{.*}}scope: ![[VARSP:[0-9]+]] @@ -38,8 +38,8 @@ entry: metadata !DILocalVariable(scope: !1), metadata !DIExpression()), !dbg !DILocation(scope: !2) -; CHECK-LABEL: mismatched subprogram between llvm.dbg.declare variable and !dbg attachment -; CHECK-NEXT: call void @llvm.dbg.declare({{[^,]+}}, metadata ![[VAR:[0-9]+]], {{.*[^,]+}}), !dbg ![[LOC:[0-9]+]] +; CHECK-LABEL: mismatched subprogram between #dbg record variable and DILocation +; CHECK-NEXT: #dbg_declare({{[^,]+}}, ![[VAR:[0-9]+]], {{.*[^,]+}}, ![[LOC:[0-9]+]]) ; CHECK-NEXT: label %entry ; CHECK-NEXT: ptr @foo ; CHECK-NEXT: ![[VAR]] = !DILocalVariable({{.*}}scope: ![[VARSP:[0-9]+]] diff --git a/llvm/test/Verifier/llvm.dbg.value-expression.ll b/llvm/test/Verifier/llvm.dbg.value-expression.ll index cc45af2e8e7cb..92fd2add700ed 100644 --- a/llvm/test/Verifier/llvm.dbg.value-expression.ll +++ b/llvm/test/Verifier/llvm.dbg.value-expression.ll @@ -1,7 +1,6 @@ ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s -; CHECK: invalid llvm.dbg.value intrinsic expression -; CHECK-NEXT: call void @llvm.dbg.value({{.*}}) -; CHECK-NEXT: !"" +; CHECK: invalid #dbg record expression +; 
CHECK-NEXT: #dbg_value({{.*}}) ; CHECK: warning: ignoring invalid debug info define void @foo(i32 %a) { diff --git a/llvm/test/Verifier/llvm.dbg.value-value.ll b/llvm/test/Verifier/llvm.dbg.value-value.ll index 8b0ec1fed05c3..c390e530653cd 100644 --- a/llvm/test/Verifier/llvm.dbg.value-value.ll +++ b/llvm/test/Verifier/llvm.dbg.value-value.ll @@ -1,6 +1,6 @@ ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s -; CHECK: invalid llvm.dbg.value intrinsic address/value -; CHECK-NEXT: call void @llvm.dbg.value({{.*}}) +; CHECK: invalid #dbg record address/value +; CHECK-NEXT: #dbg_value({{.*}}) ; CHECK-NEXT: !"" ; CHECK: warning: ignoring invalid debug info diff --git a/llvm/test/Verifier/llvm.dbg.value-variable.ll b/llvm/test/Verifier/llvm.dbg.value-variable.ll index 4388e20797ce7..603a4b5c47e7d 100644 --- a/llvm/test/Verifier/llvm.dbg.value-variable.ll +++ b/llvm/test/Verifier/llvm.dbg.value-variable.ll @@ -1,7 +1,6 @@ ; RUN: llvm-as -disable-output <%s 2>&1 | FileCheck %s -; CHECK: invalid llvm.dbg.value intrinsic variable -; CHECK-NEXT: call void @llvm.dbg.value({{.*}}) -; CHECK-NEXT: !"" +; CHECK: invalid #dbg record variable +; CHECK-NEXT: #dbg_value({{.*}}) ; CHECK: warning: ignoring invalid debug info define void @foo(i32 %a) { diff --git a/llvm/unittests/IR/DebugInfoTest.cpp b/llvm/unittests/IR/DebugInfoTest.cpp index a888fd6c6cdc3..d7aa584bb8cb4 100644 --- a/llvm/unittests/IR/DebugInfoTest.cpp +++ b/llvm/unittests/IR/DebugInfoTest.cpp @@ -991,7 +991,6 @@ TEST(MetadataTest, ConvertDbgToDbgVariableRecord) { Instruction *RetInst = &*std::next(FirstInst->getIterator()); // Set-up DbgMarkers in this block. - ExitBlock->IsNewDbgInfoFormat = true; ExitBlock->createMarker(FirstInst); ExitBlock->createMarker(RetInst); @@ -1127,7 +1126,6 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) { BasicBlock *BB1 = &F->getEntryBlock(); // First instruction should be a dbg.value. 
EXPECT_TRUE(isa(BB1->front())); - EXPECT_FALSE(BB1->IsNewDbgInfoFormat); // Validating the block for DbgVariableRecords / DbgMarkers shouldn't fail -- // there's no data stored right now. bool BrokenDebugInfo = false; @@ -1135,15 +1133,8 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) { EXPECT_FALSE(Error); EXPECT_FALSE(BrokenDebugInfo); - // Function and module should be marked as not having the new format too. - EXPECT_FALSE(F->IsNewDbgInfoFormat); - EXPECT_FALSE(M->IsNewDbgInfoFormat); - // Now convert. M->convertToNewDbgValues(); - EXPECT_TRUE(M->IsNewDbgInfoFormat); - EXPECT_TRUE(F->IsNewDbgInfoFormat); - EXPECT_TRUE(BB1->IsNewDbgInfoFormat); // There should now be no dbg.value instructions! // Ensure the first instruction exists, the test all of them. @@ -1180,7 +1171,6 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) { // There should be no DbgVariableRecords / DbgMarkers in the second block, but // it should be marked as being in the new format. BasicBlock *BB2 = BB1->getNextNode(); - EXPECT_TRUE(BB2->IsNewDbgInfoFormat); for (auto &Inst : *BB2) // Either there should be no marker, or it should be empty. EXPECT_TRUE(!Inst.DebugMarker || @@ -1207,9 +1197,6 @@ TEST(MetadataTest, DbgVariableRecordConversionRoutines) { // Convert everything back to the "old" format and ensure it's right. M->convertFromNewDbgValues(); - EXPECT_FALSE(M->IsNewDbgInfoFormat); - EXPECT_FALSE(F->IsNewDbgInfoFormat); - EXPECT_FALSE(BB1->IsNewDbgInfoFormat); EXPECT_EQ(BB1->size(), 4u); ASSERT_TRUE(isa(BB1->front())); From e15d50d5ff295368edaf7bff67f405617310722c Mon Sep 17 00:00:00 2001 From: Darren Wihandi <65404740+fairywreath@users.noreply.github.com> Date: Wed, 11 Jun 2025 09:20:40 -0400 Subject: [PATCH 074/851] [mlir][spirv] Add lowering of multiple math trig/hypb functions (#143604) Add Math to SPIRV lowering for tan, asin, acos, sinh, cosh, asinh, acosh and atanh. This completes the lowering of all trigonometric and hyperbolic functions from math to SPIRV. 
--- .../Conversion/MathToSPIRV/MathToSPIRV.cpp | 20 ++++++++++-- .../MathToSPIRV/math-to-gl-spirv.mlir | 32 +++++++++++++++++++ .../MathToSPIRV/math-to-opencl-spirv.mlir | 32 +++++++++++++++++++ 3 files changed, 82 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp index 1b83794b5f450..501bfa223fb18 100644 --- a/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp +++ b/mlir/lib/Conversion/MathToSPIRV/MathToSPIRV.cpp @@ -509,7 +509,15 @@ void populateMathToSPIRVPatterns(const SPIRVTypeConverter &typeConverter, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern>( + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern>( typeConverter, patterns.getContext()); // OpenCL patterns @@ -533,7 +541,15 @@ void populateMathToSPIRVPatterns(const SPIRVTypeConverter &typeConverter, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, CheckedElementwiseOpPattern, - CheckedElementwiseOpPattern>( + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern, + CheckedElementwiseOpPattern>( typeConverter, patterns.getContext()); } diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir index 5c6561c104389..b8e001c9f6950 100644 --- a/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir +++ b/mlir/test/Conversion/MathToSPIRV/math-to-gl-spirv.mlir @@ -46,6 +46,22 @@ func.func @float32_unary_scalar(%arg0: f32) { %14 = math.ceil %arg0 : f32 // CHECK: spirv.GL.Floor 
%{{.*}}: f32 %15 = math.floor %arg0 : f32 + // CHECK: spirv.GL.Tan %{{.*}}: f32 + %16 = math.tan %arg0 : f32 + // CHECK: spirv.GL.Asin %{{.*}}: f32 + %17 = math.asin %arg0 : f32 + // CHECK: spirv.GL.Acos %{{.*}}: f32 + %18 = math.acos %arg0 : f32 + // CHECK: spirv.GL.Sinh %{{.*}}: f32 + %19 = math.sinh %arg0 : f32 + // CHECK: spirv.GL.Cosh %{{.*}}: f32 + %20 = math.cosh %arg0 : f32 + // CHECK: spirv.GL.Asinh %{{.*}}: f32 + %21 = math.asinh %arg0 : f32 + // CHECK: spirv.GL.Acosh %{{.*}}: f32 + %22 = math.acosh %arg0 : f32 + // CHECK: spirv.GL.Atanh %{{.*}}: f32 + %23 = math.atanh %arg0 : f32 return } @@ -85,6 +101,22 @@ func.func @float32_unary_vector(%arg0: vector<3xf32>) { %11 = math.tanh %arg0 : vector<3xf32> // CHECK: spirv.GL.Sin %{{.*}}: vector<3xf32> %12 = math.sin %arg0 : vector<3xf32> + // CHECK: spirv.GL.Tan %{{.*}}: vector<3xf32> + %13 = math.tan %arg0 : vector<3xf32> + // CHECK: spirv.GL.Asin %{{.*}}: vector<3xf32> + %14 = math.asin %arg0 : vector<3xf32> + // CHECK: spirv.GL.Acos %{{.*}}: vector<3xf32> + %15 = math.acos %arg0 : vector<3xf32> + // CHECK: spirv.GL.Sinh %{{.*}}: vector<3xf32> + %16 = math.sinh %arg0 : vector<3xf32> + // CHECK: spirv.GL.Cosh %{{.*}}: vector<3xf32> + %17 = math.cosh %arg0 : vector<3xf32> + // CHECK: spirv.GL.Asinh %{{.*}}: vector<3xf32> + %18 = math.asinh %arg0 : vector<3xf32> + // CHECK: spirv.GL.Acosh %{{.*}}: vector<3xf32> + %19 = math.acosh %arg0 : vector<3xf32> + // CHECK: spirv.GL.Atanh %{{.*}}: vector<3xf32> + %20 = math.atanh %arg0 : vector<3xf32> return } diff --git a/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir b/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir index 393a910c1fb1d..56a0d4dafec8c 100644 --- a/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir +++ b/mlir/test/Conversion/MathToSPIRV/math-to-opencl-spirv.mlir @@ -48,6 +48,22 @@ func.func @float32_unary_scalar(%arg0: f32) { %16 = math.erf %arg0 : f32 // CHECK: spirv.CL.round %{{.*}}: f32 %17 = math.round %arg0 : f32 + // 
CHECK: spirv.CL.tan %{{.*}}: f32 + %18 = math.tan %arg0 : f32 + // CHECK: spirv.CL.asin %{{.*}}: f32 + %19 = math.asin %arg0 : f32 + // CHECK: spirv.CL.acos %{{.*}}: f32 + %20 = math.acos %arg0 : f32 + // CHECK: spirv.CL.sinh %{{.*}}: f32 + %21 = math.sinh %arg0 : f32 + // CHECK: spirv.CL.cosh %{{.*}}: f32 + %22 = math.cosh %arg0 : f32 + // CHECK: spirv.CL.asinh %{{.*}}: f32 + %23 = math.asinh %arg0 : f32 + // CHECK: spirv.CL.acosh %{{.*}}: f32 + %24 = math.acosh %arg0 : f32 + // CHECK: spirv.CL.atanh %{{.*}}: f32 + %25 = math.atanh %arg0 : f32 return } @@ -87,6 +103,22 @@ func.func @float32_unary_vector(%arg0: vector<3xf32>) { %11 = math.tanh %arg0 : vector<3xf32> // CHECK: spirv.CL.sin %{{.*}}: vector<3xf32> %12 = math.sin %arg0 : vector<3xf32> + // CHECK: spirv.CL.tan %{{.*}}: vector<3xf32> + %13 = math.tan %arg0 : vector<3xf32> + // CHECK: spirv.CL.asin %{{.*}}: vector<3xf32> + %14 = math.asin %arg0 : vector<3xf32> + // CHECK: spirv.CL.acos %{{.*}}: vector<3xf32> + %15 = math.acos %arg0 : vector<3xf32> + // CHECK: spirv.CL.sinh %{{.*}}: vector<3xf32> + %16 = math.sinh %arg0 : vector<3xf32> + // CHECK: spirv.CL.cosh %{{.*}}: vector<3xf32> + %17 = math.cosh %arg0 : vector<3xf32> + // CHECK: spirv.CL.asinh %{{.*}}: vector<3xf32> + %18 = math.asinh %arg0 : vector<3xf32> + // CHECK: spirv.CL.acosh %{{.*}}: vector<3xf32> + %19 = math.acosh %arg0 : vector<3xf32> + // CHECK: spirv.CL.atanh %{{.*}}: vector<3xf32> + %20 = math.atanh %arg0 : vector<3xf32> return } From cc9f67416d048bf464425b5a9243219efcb08c34 Mon Sep 17 00:00:00 2001 From: Kajetan Puchalski Date: Wed, 11 Jun 2025 14:30:02 +0100 Subject: [PATCH 075/851] [flang][OpenMP] Consider previous DSA for static duration variables (#143601) Symbols that have a pre-existing DSA set in the enclosing context should not be made shared based on them being static duration variables. 
Suggested-by: Leandro Lupori --------- Signed-off-by: Kajetan Puchalski --- flang/lib/Semantics/resolve-directives.cpp | 4 +++- flang/test/Semantics/OpenMP/implicit-dsa.f90 | 22 ++++++++++++++++++++ 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 65823adcef19d..93bf510fbc3c7 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -2382,7 +2382,9 @@ void OmpAttributeVisitor::CreateImplicitSymbols(const Symbol *symbol) { dsa = prevDSA; } else if (taskGenDir) { // TODO 5) dummy arg in orphaned taskgen construct -> firstprivate - if (prevDSA.test(Symbol::Flag::OmpShared) || isStaticStorageDuration) { + if (prevDSA.test(Symbol::Flag::OmpShared) || + (isStaticStorageDuration && + (prevDSA & dataSharingAttributeFlags).none())) { // 6) shared in enclosing context -> shared dsa = {Symbol::Flag::OmpShared}; makeSymbol(dsa); diff --git a/flang/test/Semantics/OpenMP/implicit-dsa.f90 b/flang/test/Semantics/OpenMP/implicit-dsa.f90 index 3e9348575597b..4a07e256e2bb6 100644 --- a/flang/test/Semantics/OpenMP/implicit-dsa.f90 +++ b/flang/test/Semantics/OpenMP/implicit-dsa.f90 @@ -244,3 +244,25 @@ subroutine implicit_dsa_test_12 !REF: /implicit_dsa_test_12/tm3a print *,tm3a end subroutine + +! 
Test static duration variables with DSA set in the enclosing scope do not default to shared DSA +!DEF: /implicit_dsa_test_13_mod Module +module implicit_dsa_test_13_mod + !DEF: /implicit_dsa_test_13_mod/a PUBLIC ObjectEntity INTEGER(4) + integer::a=5 +contains + !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13 PUBLIC (Subroutine) Subprogram + subroutine implicit_dsa_test_13 + !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13/i ObjectEntity INTEGER(4) + integer i + !$omp do private(a) + !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13/OtherConstruct1/i (OmpPrivate, OmpPreDetermined) HostAssoc INTEGER(4) + do i=0,10 + !$omp task + !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13/OtherConstruct1/OtherConstruct1/a (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4) + !DEF: /implicit_dsa_test_13_mod/implicit_dsa_test_13/OtherConstruct1/OtherConstruct1/i (OmpFirstPrivate, OmpImplicit) HostAssoc INTEGER(4) + a=a+i + !$omp end task + end do + end subroutine implicit_dsa_test_13 +end module implicit_dsa_test_13_mod From b512077c373a4416c506002383c69867cfee0741 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 11 Jun 2025 06:34:46 -0700 Subject: [PATCH 076/851] [flang][runtime] Another try to fix build failure (#143702) Tweak accessibility to try to get code past whatever gcc is being used by the flang-runtime-cuda-gcc build bot. 
--- .../include/flang-rt/runtime/work-queue.h | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h index f7f4777839836..f8cc820c06ca1 100644 --- a/flang-rt/include/flang-rt/runtime/work-queue.h +++ b/flang-rt/include/flang-rt/runtime/work-queue.h @@ -94,7 +94,7 @@ template class ImmediateTicketRunner { // Base class for ticket workers that operate elementwise over descriptors class Elementwise { -protected: +public: RT_API_ATTRS Elementwise( const Descriptor &instance, const Descriptor *from = nullptr) : instance_{instance}, from_{from} { @@ -120,6 +120,7 @@ class Elementwise { } } +protected: const Descriptor &instance_, *from_{nullptr}; std::size_t elements_{instance_.Elements()}; std::size_t elementAt_{0}; @@ -129,7 +130,7 @@ class Elementwise { // Base class for ticket workers that operate over derived type components. class Componentwise { -protected: +public: RT_API_ATTRS Componentwise(const typeInfo::DerivedType &); RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; } RT_API_ATTRS void Advance() { @@ -147,6 +148,7 @@ class Componentwise { } RT_API_ATTRS void GetComponent(); +protected: const typeInfo::DerivedType &derived_; std::size_t components_{0}, componentAt_{0}; const typeInfo::Component *component_{nullptr}; @@ -155,8 +157,8 @@ class Componentwise { // Base class for ticket workers that operate over derived type components // in an outer loop, and elements in an inner loop. 
-class ComponentsOverElements : protected Componentwise, protected Elementwise { -protected: +class ComponentsOverElements : public Componentwise, public Elementwise { +public: RT_API_ATTRS ComponentsOverElements(const Descriptor &instance, const typeInfo::DerivedType &derived, const Descriptor *from = nullptr) : Componentwise{derived}, Elementwise{instance, from} { @@ -187,13 +189,14 @@ class ComponentsOverElements : protected Componentwise, protected Elementwise { Componentwise::Reset(); } +protected: int phase_{0}; }; // Base class for ticket workers that operate over elements in an outer loop, // type components in an inner loop. -class ElementsOverComponents : protected Elementwise, protected Componentwise { -protected: +class ElementsOverComponents : public Elementwise, public Componentwise { +public: RT_API_ATTRS ElementsOverComponents(const Descriptor &instance, const typeInfo::DerivedType &derived, const Descriptor *from = nullptr) : Elementwise{instance, from}, Componentwise{derived} { @@ -219,6 +222,7 @@ class ElementsOverComponents : protected Elementwise, protected Componentwise { Elementwise::Advance(); } +protected: int phase_{0}; }; @@ -319,7 +323,7 @@ class AssignTicket : public ImmediateTicketRunner { template class DerivedAssignTicket : public ImmediateTicketRunner>, - protected std::conditional_t { public: using Base = std::conditional_t class DescriptorIoTicket : public ImmediateTicketRunner>, - protected Elementwise { + private Elementwise { public: RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io, const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table, @@ -372,7 +376,7 @@ class DescriptorIoTicket template class DerivedIoTicket : public ImmediateTicketRunner>, - protected ElementsOverComponents { + private ElementsOverComponents { public: RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io, const Descriptor &descriptor, const typeInfo::DerivedType &derived, From b09206db154bab8fa09b6708e642a6bba3d125be Mon Sep 17 
00:00:00 2001 From: Igor Wodiany Date: Wed, 11 Jun 2025 14:37:28 +0100 Subject: [PATCH 077/851] [mlir][spirv] Include `SPIRV_AnyImage` in `SPIRV_Type` (#143676) This change is trigger by encountering the following error: ``` :0: error: 'spirv.Load' op result #0 must be void or bool or 8/16/32/64-bit integer or 16/32/64-bit float or vector of bool or 8/16/32/64-bit integer or 16/32/64-bit float values of length 2/3/4/8/16 or any SPIR-V pointer type or any SPIR-V array type or any SPIR-V run time array type or any SPIR-V struct type or any SPIR-V cooperative matrix type or any SPIR-V matrix type or any SPIR-V sampled image type, but got '!spirv.image':0: note: see current operation: %126 = "spirv.Load"(%125) {relaxed_precision} : (!spirv.ptr, UniformConstant>) -> !spirv.image ``` --- mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td | 3 ++- mlir/test/Dialect/SPIRV/IR/memory-ops.mlir | 10 ++++++++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td index 8fd533db83d9a..b143cf9a5f509 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVBase.td @@ -4196,7 +4196,8 @@ def SPIRV_Composite : def SPIRV_Type : AnyTypeOf<[ SPIRV_Void, SPIRV_Bool, SPIRV_Integer, SPIRV_Float, SPIRV_Vector, SPIRV_AnyPtr, SPIRV_AnyArray, SPIRV_AnyRTArray, SPIRV_AnyStruct, - SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix, SPIRV_AnySampledImage + SPIRV_AnyCooperativeMatrix, SPIRV_AnyMatrix, SPIRV_AnySampledImage, + SPIRV_AnyImage ]>; def SPIRV_SignedInt : SignedIntOfWidths<[8, 16, 32, 64]>; diff --git a/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir b/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir index 57ff94762ff68..a3b96c698a344 100644 --- a/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/memory-ops.mlir @@ -356,6 +356,16 @@ spirv.module Logical GLSL450 { // ----- +// CHECK-LABEL: @image_load +func.func @image_load() -> 
() { + %0 = spirv.Variable : !spirv.ptr, Function> + // CHECK: spirv.Load "Function" %{{.*}} : !spirv.image + %1 = spirv.Load "Function" %0 : !spirv.image + return +} + +// ----- + //===----------------------------------------------------------------------===// // spirv.StoreOp //===----------------------------------------------------------------------===// From 6b0cb762af97579ca8ff5eea9be896169a1752b7 Mon Sep 17 00:00:00 2001 From: Corentin Jabot Date: Wed, 11 Jun 2025 15:39:41 +0200 Subject: [PATCH 078/851] [Clang] _default-movable_ should be based on the first declaration (#143661) When the definition of a special member function was defaulted we would not consider it user-provided, even when the first declaration was not defaulted. Fixes #143599 --- clang/lib/Sema/SemaTypeTraits.cpp | 16 ++++++++----- .../SemaCXX/cxx2c-trivially-relocatable.cpp | 21 +++++++++++++++++ .../SemaCXX/type-traits-unsatisfied-diags.cpp | 23 +++++++++++++++++++ 3 files changed, 54 insertions(+), 6 deletions(-) diff --git a/clang/lib/Sema/SemaTypeTraits.cpp b/clang/lib/Sema/SemaTypeTraits.cpp index d663e5581093e..1738ab4466001 100644 --- a/clang/lib/Sema/SemaTypeTraits.cpp +++ b/clang/lib/Sema/SemaTypeTraits.cpp @@ -105,7 +105,7 @@ static CXXMethodDecl *LookupSpecialMemberFromXValue(Sema &SemaRef, switch (OCS.BestViableFunction(SemaRef, LookupLoc, Best)) { case OR_Success: case OR_Deleted: - return cast(Best->Function); + return cast(Best->Function)->getCanonicalDecl(); default: return nullptr; } @@ -164,6 +164,8 @@ static bool IsDefaultMovable(Sema &SemaRef, const CXXRecordDecl *D) { if (!Dtr) return true; + Dtr = Dtr->getCanonicalDecl(); + if (Dtr->isUserProvided() && (!Dtr->isDefaulted() || Dtr->isDeleted())) return false; @@ -2044,11 +2046,13 @@ static void DiagnoseNonDefaultMovable(Sema &SemaRef, SourceLocation Loc, << diag::TraitNotSatisfiedReason::UserProvidedAssign << Decl->isMoveAssignmentOperator() << Decl->getSourceRange(); } - CXXDestructorDecl *Dtr = D->getDestructor(); - 
if (Dtr && Dtr->isUserProvided() && !Dtr->isDefaulted()) - SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) - << diag::TraitNotSatisfiedReason::DeletedDtr << /*User Provided*/ 1 - << Dtr->getSourceRange(); + if (CXXDestructorDecl *Dtr = D->getDestructor()) { + Dtr = Dtr->getCanonicalDecl(); + if (Dtr->isUserProvided() && !Dtr->isDefaulted()) + SemaRef.Diag(Loc, diag::note_unsatisfied_trait_reason) + << diag::TraitNotSatisfiedReason::DeletedDtr << /*User Provided*/ 1 + << Dtr->getSourceRange(); + } } static void DiagnoseNonTriviallyRelocatableReason(Sema &SemaRef, diff --git a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp index aff172e0bc70a..9d43994ee7661 100644 --- a/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp +++ b/clang/test/SemaCXX/cxx2c-trivially-relocatable.cpp @@ -388,3 +388,24 @@ void do_test__builtin_trivially_relocate() { // expected-note@-1 {{'test__builtin_trivially_relocate' requested here}} // expected-error@#reloc1 {{first argument to '__builtin_trivially_relocate' must be relocatable}} } + + +namespace GH143599 { +struct A { ~A (); }; +A::~A () = default; + +static_assert (!__builtin_is_cpp_trivially_relocatable(A)); +static_assert (!__builtin_is_replaceable(A)); + +struct B { B(const B&); }; +B::B (const B&) = default; + +static_assert (!__builtin_is_cpp_trivially_relocatable(B)); +static_assert (!__builtin_is_replaceable(B)); + +struct C { C& operator=(const C&); }; +C& C::operator=(const C&) = default; + +static_assert (!__builtin_is_cpp_trivially_relocatable(C)); +static_assert (!__builtin_is_replaceable(C)); +} diff --git a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp index 9e053034acda4..a8c78f6304ca9 100644 --- a/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp +++ b/clang/test/SemaCXX/type-traits-unsatisfied-diags.cpp @@ -320,6 +320,29 @@ 
static_assert(__builtin_is_cpp_trivially_relocatable(UnionOfPolymorphic)); } +struct GH143599 { // expected-note 2 {{'GH143599' defined here}} + ~GH143599 (); + GH143599(const GH143599&); + GH143599& operator=(const GH143599&); +}; +GH143599::~GH143599 () = default; +GH143599::GH143599 (const GH143599&) = default; +GH143599& GH143599::operator=(const GH143599&) = default; + +static_assert (__builtin_is_cpp_trivially_relocatable(GH143599)); +// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_cpp_trivially_relocatable(GH143599)'}} \ +// expected-note@-1 {{'GH143599' is not trivially relocatable}} \ +// expected-note@-1 {{because it has a user provided copy constructor}} \ +// expected-note@-1 {{because it has a user provided copy assignment operator}} \ +// expected-note@-1 {{because it has a user-provided destructor}} + +static_assert (__builtin_is_replaceable(GH143599)); +// expected-error@-1 {{static assertion failed due to requirement '__builtin_is_replaceable(GH143599)'}} \ +// expected-note@-1 {{'GH143599' is not replaceable}} \ +// expected-note@-1 {{because it has a user provided copy constructor}} \ +// expected-note@-1 {{because it has a user provided copy assignment operator}} \ +// expected-note@-1 {{because it has a user-provided destructor}} + namespace trivially_copyable { struct B { virtual ~B(); From c71a2e688828ab3ede4fb54168a674ff68396f61 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 11 Jun 2025 14:43:15 +0100 Subject: [PATCH 079/851] [DebugInfo][RemoveDIs] Remove some debug intrinsic-only codepaths (#143451) These are opportunistic deletions as more places that make use of the IsNewDbgInfoFormat flag are removed. It should (TM)(R) all be dead code now that `IsNewDbgInfoFormat` should be true everywhere. 
FastISel: we don't need to do debug-aware instruction counting any more, because there are no debug instructions, Autoupgrade: you can no-longer avoid autoupgrading of intrinsics to records DIBuilder: Delete the code for creating debug intrinsics (!) LoopUtils: No need to handle debug instructions, they don't exist --- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 3 - llvm/lib/IR/AutoUpgrade.cpp | 25 ++---- llvm/lib/IR/DIBuilder.cpp | 97 +++++----------------- llvm/lib/IR/DebugInfo.cpp | 19 +---- llvm/lib/Transforms/Utils/LoopUtils.cpp | 36 +++----- llvm/unittests/IR/IRBuilderTest.cpp | 10 --- 6 files changed, 40 insertions(+), 150 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 59cd0dc8dd348..e8a3df3366b2b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1671,9 +1671,6 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, const DebugLoc &DbgLoc) { const BasicBlock *BB = FuncInfo.MBB->getBasicBlock(); bool BlockHasMultipleInstrs = &BB->front() != &BB->back(); - // Handle legacy case of debug intrinsics - if (BlockHasMultipleInstrs && !BB->getModule()->IsNewDbgInfoFormat) - BlockHasMultipleInstrs = BB->sizeWithoutDebug() > 1; if (BlockHasMultipleInstrs && FuncInfo.MBB->isLayoutSuccessor(MSucc)) { // For more accurate line information if this is the only non-debug // instruction in the block then emit it, otherwise we have the diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index cb90af36f3d9f..a0886776ff93f 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -4490,7 +4490,6 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Builder.SetInsertPoint(CI->getParent(), CI->getIterator()); if (!NewFn) { - bool FallthroughToDefaultUpgrade = false; // Get the Function's name. 
StringRef Name = F->getName(); @@ -4518,29 +4517,15 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } else if (IsAMDGCN) { Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder); } else if (IsDbg) { - // We might have decided we don't want the new format after all between - // first requesting the upgrade and now; skip the conversion if that is - // the case, and check here to see if the intrinsic needs to be upgraded - // normally. - if (!CI->getModule()->IsNewDbgInfoFormat) { - bool NeedsUpgrade = - upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false); - if (!NeedsUpgrade) - return; - FallthroughToDefaultUpgrade = true; - } else { - upgradeDbgIntrinsicToDbgRecord(Name, CI); - } + upgradeDbgIntrinsicToDbgRecord(Name, CI); } else { llvm_unreachable("Unknown function for CallBase upgrade."); } - if (!FallthroughToDefaultUpgrade) { - if (Rep) - CI->replaceAllUsesWith(Rep); - CI->eraseFromParent(); - return; - } + if (Rep) + CI->replaceAllUsesWith(Rep); + CI->eraseFromParent(); + return; } const auto &DefaultCase = [&]() -> void { diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 5e5ff22132e99..1484c549dd580 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -1047,36 +1047,13 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID)); assert(Link && "Linked instruction must have DIAssign metadata attached"); - if (M.IsNewDbgInfoFormat) { - DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign( - Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL); - // Insert after LinkedInstr. 
- BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator()); - NextIt.setHeadBit(true); - insertDbgVariableRecord(DVR, NextIt); - return DVR; - } - - LLVMContext &Ctx = LinkedInstr->getContext(); - Module *M = LinkedInstr->getModule(); - if (!AssignFn) - AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); - - std::array Args = { - MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)), - MetadataAsValue::get(Ctx, SrcVar), - MetadataAsValue::get(Ctx, ValExpr), - MetadataAsValue::get(Ctx, Link), - MetadataAsValue::get(Ctx, ValueAsMetadata::get(Addr)), - MetadataAsValue::get(Ctx, AddrExpr), - }; - - IRBuilder<> B(Ctx); - B.SetCurrentDebugLocation(DL); - - auto *DVI = cast(B.CreateCall(AssignFn, Args)); - DVI->insertAfter(LinkedInstr->getIterator()); - return DVI; + DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign( + Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL); + // Insert after LinkedInstr. + BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator()); + NextIt.setHeadBit(true); + insertDbgVariableRecord(DVR, NextIt); + return DVR; } /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics. 
@@ -1101,18 +1078,10 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val, DIExpression *Expr, const DILocation *DL, InsertPosition InsertPt) { - if (M.IsNewDbgInfoFormat) { - DbgVariableRecord *DVR = - DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertPt); - return DVR; - } - - if (!ValueFn) - ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value); - auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt); - cast(DVI)->setTailCall(); - return DVI; + DbgVariableRecord *DVR = + DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL); + insertDbgVariableRecord(DVR, InsertPt); + return DVR; } DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, @@ -1124,25 +1093,10 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, VarInfo->getScope()->getSubprogram() && "Expected matching subprograms"); - if (M.IsNewDbgInfoFormat) { - DbgVariableRecord *DVR = - DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertPt); - return DVR; - } - - if (!DeclareFn) - DeclareFn = getDeclareIntrin(M); - - trackIfUnresolved(VarInfo); - trackIfUnresolved(Expr); - Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage), - MetadataAsValue::get(VMContext, VarInfo), - MetadataAsValue::get(VMContext, Expr)}; - - IRBuilder<> B(DL->getContext()); - initIRBuilder(B, DL, InsertPt); - return B.CreateCall(DeclareFn, Args); + DbgVariableRecord *DVR = + DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL); + insertDbgVariableRecord(DVR, InsertPt); + return DVR; } void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR, @@ -1191,23 +1145,12 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, "Expected matching subprograms"); trackIfUnresolved(LabelInfo); - if (M.IsNewDbgInfoFormat) { - DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL); - if 
(InsertPt.isValid()) { - auto *BB = InsertPt.getBasicBlock(); - BB->insertDbgRecordBefore(DLR, InsertPt); - } - return DLR; + DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL); + if (InsertPt.isValid()) { + auto *BB = InsertPt.getBasicBlock(); + BB->insertDbgRecordBefore(DLR, InsertPt); } - - if (!LabelFn) - LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label); - - Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)}; - - IRBuilder<> B(DL->getContext()); - initIRBuilder(B, DL, InsertPt); - return B.CreateCall(LabelFn, Args); + return DLR; } void DIBuilder::replaceVTableHolder(DICompositeType *&T, DIType *VTableHolder) { diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 7db9891fdbd75..2a84e7bae0f10 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2123,22 +2123,11 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest, Expr = *R; } DIExpression *AddrExpr = DIExpression::get(StoreLikeInst.getContext(), {}); - if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) { - auto *Assign = DbgVariableRecord::createLinkedDVRAssign( - &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL); - (void)Assign; - LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); - return; - } - auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest, - AddrExpr, VarRec.DL); + auto *Assign = DbgVariableRecord::createLinkedDVRAssign( + &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL); (void)Assign; - LLVM_DEBUG(if (!Assign.isNull()) { - if (const auto *Record = dyn_cast(Assign)) - errs() << " > INSERT: " << *Record << "\n"; - else - errs() << " > INSERT: " << *cast(Assign) << "\n"; - }); + LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); + return; } #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h). 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 0681ebc111cb2..ff69fa9f70c4e 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -606,7 +606,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, // Use a map to unique and a vector to guarantee deterministic ordering. llvm::SmallDenseSet DeadDebugSet; - llvm::SmallVector DeadDebugInst; llvm::SmallVector DeadDbgVariableRecords; if (ExitBlock) { @@ -633,29 +632,19 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, U.set(Poison); } - // RemoveDIs: do the same as below for DbgVariableRecords. - if (Block->IsNewDbgInfoFormat) { - for (DbgVariableRecord &DVR : llvm::make_early_inc_range( - filterDbgVars(I.getDbgRecordRange()))) { - DebugVariable Key(DVR.getVariable(), DVR.getExpression(), - DVR.getDebugLoc().get()); - if (!DeadDebugSet.insert(Key).second) - continue; - // Unlinks the DVR from it's container, for later insertion. - DVR.removeFromParent(); - DeadDbgVariableRecords.push_back(&DVR); - } - } - - // For one of each variable encountered, preserve a debug intrinsic (set + // For one of each variable encountered, preserve a debug record (set // to Poison) and transfer it to the loop exit. This terminates any // variable locations that were set during the loop. - auto *DVI = dyn_cast(&I); - if (!DVI) - continue; - if (!DeadDebugSet.insert(DebugVariable(DVI)).second) - continue; - DeadDebugInst.push_back(DVI); + for (DbgVariableRecord &DVR : + llvm::make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) { + DebugVariable Key(DVR.getVariable(), DVR.getExpression(), + DVR.getDebugLoc().get()); + if (!DeadDebugSet.insert(Key).second) + continue; + // Unlinks the DVR from it's container, for later insertion. 
+ DVR.removeFromParent(); + DeadDbgVariableRecords.push_back(&DVR); + } } // After the loop has been deleted all the values defined and modified @@ -671,9 +660,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, "There should be a non-PHI instruction in exit block, else these " "instructions will have no parent."); - for (auto *DVI : DeadDebugInst) - DVI->moveBefore(*ExitBlock, InsertDbgValueBefore); - // Due to the "head" bit in BasicBlock::iterator, we're going to insert // each DbgVariableRecord right at the start of the block, wheras dbg.values // would be repeatedly inserted before the first instruction. To replicate diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index 3a7ba924792ef..aadae5287c380 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -1003,18 +1003,8 @@ TEST_F(IRBuilderTest, DIBuilder) { EXPECT_TRUE(verifyModule(*M)); }; - // Test in new-debug mode. - EXPECT_TRUE(M->IsNewDbgInfoFormat); RunTest(); - - // Test in old-debug mode. - // Reset the test then call convertFromNewDbgValues to flip the flag - // on the test's Module, Function and BasicBlock. 
TearDown(); - SetUp(); - M->convertFromNewDbgValues(); - EXPECT_FALSE(M->IsNewDbgInfoFormat); - RunTest(); } TEST_F(IRBuilderTest, createArtificialSubprogram) { From 46d9abbba2ad63c0280d4248cc2349de78439294 Mon Sep 17 00:00:00 2001 From: David Truby Date: Wed, 11 Jun 2025 14:50:39 +0100 Subject: [PATCH 080/851] [flang] Add David Truby as maintainer for Flang on Windows (#142619) --- flang/Maintainers.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/flang/Maintainers.md b/flang/Maintainers.md index f4a7635389138..b994c300e4e2c 100644 --- a/flang/Maintainers.md +++ b/flang/Maintainers.md @@ -79,6 +79,13 @@ clementval@gmail.com (email), clementval (GitHub), clementval (Discourse) Abid Qadeer \ haqadeer@amd.com (email), abidh (GitHub), abidh (Discourse) +### Platform maintainers +These maintainers are responsible for particular platforms that Flang supports + +#### Windows +David Truby \ +david.truby@arm.com (email), davidtruby (GitHub), davidtruby (Discourse), truby (Discord) + ## Inactive Maintainers ### Lead Maintainers #### Backend : (Lowering, FIR, Codegen) From 76197ea6f91f802467f2614e1217e99eb4037200 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 11 Jun 2025 14:51:13 +0100 Subject: [PATCH 081/851] Revert "[DebugInfo][RemoveDIs] Remove some debug intrinsic-only codepaths (#143451)" This reverts commit c71a2e688828ab3ede4fb54168a674ff68396f61. /me squints -- this is hitting an assertion I thought had been deleted, will revert and investigate for a bit. 
--- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 3 + llvm/lib/IR/AutoUpgrade.cpp | 25 ++++-- llvm/lib/IR/DIBuilder.cpp | 97 +++++++++++++++++----- llvm/lib/IR/DebugInfo.cpp | 19 ++++- llvm/lib/Transforms/Utils/LoopUtils.cpp | 36 +++++--- llvm/unittests/IR/IRBuilderTest.cpp | 10 +++ 6 files changed, 150 insertions(+), 40 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index e8a3df3366b2b..59cd0dc8dd348 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1671,6 +1671,9 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, const DebugLoc &DbgLoc) { const BasicBlock *BB = FuncInfo.MBB->getBasicBlock(); bool BlockHasMultipleInstrs = &BB->front() != &BB->back(); + // Handle legacy case of debug intrinsics + if (BlockHasMultipleInstrs && !BB->getModule()->IsNewDbgInfoFormat) + BlockHasMultipleInstrs = BB->sizeWithoutDebug() > 1; if (BlockHasMultipleInstrs && FuncInfo.MBB->isLayoutSuccessor(MSucc)) { // For more accurate line information if this is the only non-debug // instruction in the block then emit it, otherwise we have the diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index a0886776ff93f..cb90af36f3d9f 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -4490,6 +4490,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Builder.SetInsertPoint(CI->getParent(), CI->getIterator()); if (!NewFn) { + bool FallthroughToDefaultUpgrade = false; // Get the Function's name. 
StringRef Name = F->getName(); @@ -4517,15 +4518,29 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } else if (IsAMDGCN) { Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder); } else if (IsDbg) { - upgradeDbgIntrinsicToDbgRecord(Name, CI); + // We might have decided we don't want the new format after all between + // first requesting the upgrade and now; skip the conversion if that is + // the case, and check here to see if the intrinsic needs to be upgraded + // normally. + if (!CI->getModule()->IsNewDbgInfoFormat) { + bool NeedsUpgrade = + upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false); + if (!NeedsUpgrade) + return; + FallthroughToDefaultUpgrade = true; + } else { + upgradeDbgIntrinsicToDbgRecord(Name, CI); + } } else { llvm_unreachable("Unknown function for CallBase upgrade."); } - if (Rep) - CI->replaceAllUsesWith(Rep); - CI->eraseFromParent(); - return; + if (!FallthroughToDefaultUpgrade) { + if (Rep) + CI->replaceAllUsesWith(Rep); + CI->eraseFromParent(); + return; + } } const auto &DefaultCase = [&]() -> void { diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 1484c549dd580..5e5ff22132e99 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -1047,13 +1047,36 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID)); assert(Link && "Linked instruction must have DIAssign metadata attached"); - DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign( - Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL); - // Insert after LinkedInstr. - BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator()); - NextIt.setHeadBit(true); - insertDbgVariableRecord(DVR, NextIt); - return DVR; + if (M.IsNewDbgInfoFormat) { + DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign( + Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL); + // Insert after LinkedInstr. 
+ BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator()); + NextIt.setHeadBit(true); + insertDbgVariableRecord(DVR, NextIt); + return DVR; + } + + LLVMContext &Ctx = LinkedInstr->getContext(); + Module *M = LinkedInstr->getModule(); + if (!AssignFn) + AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); + + std::array Args = { + MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)), + MetadataAsValue::get(Ctx, SrcVar), + MetadataAsValue::get(Ctx, ValExpr), + MetadataAsValue::get(Ctx, Link), + MetadataAsValue::get(Ctx, ValueAsMetadata::get(Addr)), + MetadataAsValue::get(Ctx, AddrExpr), + }; + + IRBuilder<> B(Ctx); + B.SetCurrentDebugLocation(DL); + + auto *DVI = cast(B.CreateCall(AssignFn, Args)); + DVI->insertAfter(LinkedInstr->getIterator()); + return DVI; } /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics. @@ -1078,10 +1101,18 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val, DIExpression *Expr, const DILocation *DL, InsertPosition InsertPt) { - DbgVariableRecord *DVR = - DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertPt); - return DVR; + if (M.IsNewDbgInfoFormat) { + DbgVariableRecord *DVR = + DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL); + insertDbgVariableRecord(DVR, InsertPt); + return DVR; + } + + if (!ValueFn) + ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value); + auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt); + cast(DVI)->setTailCall(); + return DVI; } DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, @@ -1093,10 +1124,25 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, VarInfo->getScope()->getSubprogram() && "Expected matching subprograms"); - DbgVariableRecord *DVR = - DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertPt); - return DVR; + if 
(M.IsNewDbgInfoFormat) { + DbgVariableRecord *DVR = + DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL); + insertDbgVariableRecord(DVR, InsertPt); + return DVR; + } + + if (!DeclareFn) + DeclareFn = getDeclareIntrin(M); + + trackIfUnresolved(VarInfo); + trackIfUnresolved(Expr); + Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage), + MetadataAsValue::get(VMContext, VarInfo), + MetadataAsValue::get(VMContext, Expr)}; + + IRBuilder<> B(DL->getContext()); + initIRBuilder(B, DL, InsertPt); + return B.CreateCall(DeclareFn, Args); } void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR, @@ -1145,12 +1191,23 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, "Expected matching subprograms"); trackIfUnresolved(LabelInfo); - DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL); - if (InsertPt.isValid()) { - auto *BB = InsertPt.getBasicBlock(); - BB->insertDbgRecordBefore(DLR, InsertPt); + if (M.IsNewDbgInfoFormat) { + DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL); + if (InsertPt.isValid()) { + auto *BB = InsertPt.getBasicBlock(); + BB->insertDbgRecordBefore(DLR, InsertPt); + } + return DLR; } - return DLR; + + if (!LabelFn) + LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label); + + Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)}; + + IRBuilder<> B(DL->getContext()); + initIRBuilder(B, DL, InsertPt); + return B.CreateCall(LabelFn, Args); } void DIBuilder::replaceVTableHolder(DICompositeType *&T, DIType *VTableHolder) { diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 2a84e7bae0f10..7db9891fdbd75 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2123,11 +2123,22 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest, Expr = *R; } DIExpression *AddrExpr = DIExpression::get(StoreLikeInst.getContext(), {}); - auto *Assign = DbgVariableRecord::createLinkedDVRAssign( - &StoreLikeInst, Val, VarRec.Var, Expr, 
Dest, AddrExpr, VarRec.DL); + if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) { + auto *Assign = DbgVariableRecord::createLinkedDVRAssign( + &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL); + (void)Assign; + LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); + return; + } + auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest, + AddrExpr, VarRec.DL); (void)Assign; - LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); - return; + LLVM_DEBUG(if (!Assign.isNull()) { + if (const auto *Record = dyn_cast(Assign)) + errs() << " > INSERT: " << *Record << "\n"; + else + errs() << " > INSERT: " << *cast(Assign) << "\n"; + }); } #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h). diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index ff69fa9f70c4e..0681ebc111cb2 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -606,6 +606,7 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, // Use a map to unique and a vector to guarantee deterministic ordering. llvm::SmallDenseSet DeadDebugSet; + llvm::SmallVector DeadDebugInst; llvm::SmallVector DeadDbgVariableRecords; if (ExitBlock) { @@ -632,19 +633,29 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, U.set(Poison); } - // For one of each variable encountered, preserve a debug record (set + // RemoveDIs: do the same as below for DbgVariableRecords. + if (Block->IsNewDbgInfoFormat) { + for (DbgVariableRecord &DVR : llvm::make_early_inc_range( + filterDbgVars(I.getDbgRecordRange()))) { + DebugVariable Key(DVR.getVariable(), DVR.getExpression(), + DVR.getDebugLoc().get()); + if (!DeadDebugSet.insert(Key).second) + continue; + // Unlinks the DVR from it's container, for later insertion. 
+ DVR.removeFromParent(); + DeadDbgVariableRecords.push_back(&DVR); + } + } + + // For one of each variable encountered, preserve a debug intrinsic (set // to Poison) and transfer it to the loop exit. This terminates any // variable locations that were set during the loop. - for (DbgVariableRecord &DVR : - llvm::make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) { - DebugVariable Key(DVR.getVariable(), DVR.getExpression(), - DVR.getDebugLoc().get()); - if (!DeadDebugSet.insert(Key).second) - continue; - // Unlinks the DVR from it's container, for later insertion. - DVR.removeFromParent(); - DeadDbgVariableRecords.push_back(&DVR); - } + auto *DVI = dyn_cast(&I); + if (!DVI) + continue; + if (!DeadDebugSet.insert(DebugVariable(DVI)).second) + continue; + DeadDebugInst.push_back(DVI); } // After the loop has been deleted all the values defined and modified @@ -660,6 +671,9 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, "There should be a non-PHI instruction in exit block, else these " "instructions will have no parent."); + for (auto *DVI : DeadDebugInst) + DVI->moveBefore(*ExitBlock, InsertDbgValueBefore); + // Due to the "head" bit in BasicBlock::iterator, we're going to insert // each DbgVariableRecord right at the start of the block, wheras dbg.values // would be repeatedly inserted before the first instruction. To replicate diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index aadae5287c380..3a7ba924792ef 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -1003,8 +1003,18 @@ TEST_F(IRBuilderTest, DIBuilder) { EXPECT_TRUE(verifyModule(*M)); }; + // Test in new-debug mode. + EXPECT_TRUE(M->IsNewDbgInfoFormat); RunTest(); + + // Test in old-debug mode. + // Reset the test then call convertFromNewDbgValues to flip the flag + // on the test's Module, Function and BasicBlock. 
TearDown(); + SetUp(); + M->convertFromNewDbgValues(); + EXPECT_FALSE(M->IsNewDbgInfoFormat); + RunTest(); } TEST_F(IRBuilderTest, createArtificialSubprogram) { From 6fb2a80189016bd4222b174ae4d72e47d0aa58ff Mon Sep 17 00:00:00 2001 From: Davide Grohmann <6573166+davidegrohmann@users.noreply.github.com> Date: Wed, 11 Jun 2025 15:56:38 +0200 Subject: [PATCH 082/851] [mlir][spirv] Truncate Literal String size at max number words (#142916) If not truncated the SPIRV serialization would not fail but instead produce an invalid SPIR-V module. --------- Signed-off-by: Davide Grohmann --- .../mlir/Target/SPIRV/SPIRVBinaryUtils.h | 7 +++++++ mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp | 20 ++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Target/SPIRV/SPIRVBinaryUtils.h b/mlir/include/mlir/Target/SPIRV/SPIRVBinaryUtils.h index e46a576f1d48e..4a4116312981a 100644 --- a/mlir/include/mlir/Target/SPIRV/SPIRVBinaryUtils.h +++ b/mlir/include/mlir/Target/SPIRV/SPIRVBinaryUtils.h @@ -30,6 +30,13 @@ constexpr uint32_t kMagicNumber = 0x07230203; /// The serializer tool ID registered to the Khronos Group constexpr uint32_t kGeneratorNumber = 22; +/// Max number of words +/// https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#_universal_limits +constexpr uint32_t kMaxWordCount = 65535; + +/// Max number of words for literal +constexpr uint32_t kMaxLiteralWordCount = kMaxWordCount - 3; + /// Appends a SPRI-V module header to `header` with the given `version` and /// `idBound`. 
void appendModuleHeader(SmallVectorImpl &header, diff --git a/mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp b/mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp index 31205d8f408f1..0ec468d4c1665 100644 --- a/mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp +++ b/mlir/lib/Target/SPIRV/SPIRVBinaryUtils.cpp @@ -13,6 +13,9 @@ #include "mlir/Target/SPIRV/SPIRVBinaryUtils.h" #include "mlir/Dialect/SPIRV/IR/SPIRVTypes.h" #include "llvm/Config/llvm-config.h" // for LLVM_VERSION_MAJOR +#include "llvm/Support/Debug.h" + +#define DEBUG_TYPE "spirv-binary-utils" using namespace mlir; @@ -67,8 +70,19 @@ uint32_t spirv::getPrefixedOpcode(uint32_t wordCount, spirv::Opcode opcode) { void spirv::encodeStringLiteralInto(SmallVectorImpl &binary, StringRef literal) { // We need to encode the literal and the null termination. - auto encodingSize = literal.size() / 4 + 1; - auto bufferStartSize = binary.size(); + size_t encodingSize = literal.size() / 4 + 1; + size_t sizeOfDataToCopy = literal.size(); + if (encodingSize >= kMaxLiteralWordCount) { + // Reserve one word for the null termination. + encodingSize = kMaxLiteralWordCount - 1; + // Do not override the last word (null termination) when copying. + sizeOfDataToCopy = (encodingSize - 1) * 4; + LLVM_DEBUG(llvm::dbgs() + << "Truncating string literal to max size (" + << (kMaxLiteralWordCount - 1) << "): " << literal << "\n"); + } + size_t bufferStartSize = binary.size(); binary.resize(bufferStartSize + encodingSize, 0); - std::memcpy(binary.data() + bufferStartSize, literal.data(), literal.size()); + std::memcpy(binary.data() + bufferStartSize, literal.data(), + sizeOfDataToCopy); } From 76e14deb4a6967388a9bf84db2feeac17a30c786 Mon Sep 17 00:00:00 2001 From: Phoebe Wang Date: Wed, 11 Jun 2025 22:08:20 +0800 Subject: [PATCH 083/851] [X86][BreakFalseDeps] Using reverse order for undef register selection (#137569) BreakFalseDeps picks the best register for undef operands if instructions have false dependency. 
The problem is if the instruction is close to the beginning of the function, ReachingDefAnalysis is over optimism to the unused registers, which results in collision with registers just defined in the caller. This patch changes the selection of undef register in an reverse order, which reduces the probability of register collisions between caller and callee. It brings improvement in some of our internal benchmarks with negligible effect on other benchmarks. --- llvm/include/llvm/CodeGen/RegisterClassInfo.h | 8 +- .../include/llvm/CodeGen/TargetRegisterInfo.h | 7 +- llvm/include/llvm/Target/Target.td | 2 +- llvm/lib/CodeGen/BreakFalseDeps.cpp | 2 +- llvm/lib/CodeGen/RegisterClassInfo.cpp | 13 +- llvm/lib/Target/X86/X86RegisterInfo.td | 28 +- llvm/test/CodeGen/X86/avx-cvt.ll | 16 +- llvm/test/CodeGen/X86/avx512-cvt.ll | 220 +++++----- .../test/CodeGen/X86/avx512-regcall-NoMask.ll | 28 +- llvm/test/CodeGen/X86/avx512fp16-cvt.ll | 36 +- llvm/test/CodeGen/X86/avx512fp16-novl.ll | 40 +- llvm/test/CodeGen/X86/break-false-dep.ll | 26 +- llvm/test/CodeGen/X86/coalescer-commute1.ll | 2 +- .../CodeGen/X86/fast-isel-fptrunc-fpext.ll | 4 +- .../fast-isel-int-float-conversion-x86-64.ll | 12 +- .../X86/fast-isel-int-float-conversion.ll | 24 +- .../fast-isel-uint-float-conversion-x86-64.ll | 12 +- .../X86/fast-isel-uint-float-conversion.ll | 24 +- llvm/test/CodeGen/X86/fcmp-logic.ll | 12 +- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 52 +-- llvm/test/CodeGen/X86/fold-load-unops.ll | 24 +- llvm/test/CodeGen/X86/fp-intrinsics.ll | 40 +- .../X86/fp-strict-scalar-inttofp-fp16.ll | 60 +-- .../CodeGen/X86/fp-strict-scalar-inttofp.ll | 76 ++-- .../X86/fp-strict-scalar-round-fp16.ll | 12 +- llvm/test/CodeGen/X86/ftrunc.ll | 6 +- llvm/test/CodeGen/X86/half.ll | 8 +- llvm/test/CodeGen/X86/isel-int-to-fp.ll | 48 +-- llvm/test/CodeGen/X86/pr34080.ll | 4 +- llvm/test/CodeGen/X86/pr37879.ll | 2 +- llvm/test/CodeGen/X86/pr38803.ll | 2 +- llvm/test/CodeGen/X86/rounding-ops.ll | 16 +- 
llvm/test/CodeGen/X86/scalar-int-to-fp.ll | 30 +- .../CodeGen/X86/select-narrow-int-to-fp.ll | 32 +- .../CodeGen/X86/split-extend-vector-inreg.ll | 44 +- llvm/test/CodeGen/X86/sse-cvttp2si.ll | 16 +- .../X86/sse2-intrinsics-x86-upgrade.ll | 8 +- .../test/CodeGen/X86/stack-folding-fp-avx1.ll | 34 +- .../CodeGen/X86/vec-strict-inttofp-128.ll | 40 +- .../CodeGen/X86/vec-strict-inttofp-256.ll | 144 +++---- .../CodeGen/X86/vec-strict-inttofp-512.ll | 64 +-- llvm/test/CodeGen/X86/vec_int_to_fp.ll | 402 +++++++++--------- .../X86/vector-constrained-fp-intrinsics.ll | 210 ++++----- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 10 +- 44 files changed, 973 insertions(+), 927 deletions(-) diff --git a/llvm/include/llvm/CodeGen/RegisterClassInfo.h b/llvm/include/llvm/CodeGen/RegisterClassInfo.h index 3096f8851516e..078ae80915fed 100644 --- a/llvm/include/llvm/CodeGen/RegisterClassInfo.h +++ b/llvm/include/llvm/CodeGen/RegisterClassInfo.h @@ -50,6 +50,8 @@ class RegisterClassInfo { // entry is valid when its tag matches. unsigned Tag = 0; + bool Reverse = false; + const MachineFunction *MF = nullptr; const TargetRegisterInfo *TRI = nullptr; @@ -86,9 +88,11 @@ class RegisterClassInfo { public: LLVM_ABI RegisterClassInfo(); - /// runOnFunction - Prepare to answer questions about MF. This must be called + /// runOnFunction - Prepare to answer questions about MF. Rev indicates to + /// use reversed raw order when compute register order. This must be called /// before any other methods are used. - LLVM_ABI void runOnMachineFunction(const MachineFunction &MF); + LLVM_ABI void runOnMachineFunction(const MachineFunction &MF, + bool Rev = false); /// getNumAllocatableRegs - Returns the number of actually allocatable /// registers in RC in the current function. 
diff --git a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h index de5a6ecb548a4..8b9ed78a8e970 100644 --- a/llvm/include/llvm/CodeGen/TargetRegisterInfo.h +++ b/llvm/include/llvm/CodeGen/TargetRegisterInfo.h @@ -68,7 +68,7 @@ class TargetRegisterClass { const bool CoveredBySubRegs; const unsigned *SuperClasses; const uint16_t SuperClassesSize; - ArrayRef (*OrderFunc)(const MachineFunction&); + ArrayRef (*OrderFunc)(const MachineFunction &, bool Rev); /// Return the register class ID number. unsigned getID() const { return MC->getID(); } @@ -199,8 +199,9 @@ class TargetRegisterClass { /// other criteria. /// /// By default, this method returns all registers in the class. - ArrayRef getRawAllocationOrder(const MachineFunction &MF) const { - return OrderFunc ? OrderFunc(MF) : getRegisters(); + ArrayRef getRawAllocationOrder(const MachineFunction &MF, + bool Rev = false) const { + return OrderFunc ? OrderFunc(MF, Rev) : getRegisters(); } /// Returns the combination of all lane masks of register in this class. diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index e8b460aaf803b..ce9a2b2751968 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -314,7 +314,7 @@ class RegisterClass regTypes, int alignment, // to use in a given machine function. The code will be inserted in a // function like this: // - // static inline unsigned f(const MachineFunction &MF) { ... } + // static inline unsigned f(const MachineFunction &MF, bool Rev) { ... } // // The function should return 0 to select the default order defined by // MemberList, 1 to select the first AltOrders entry and so on. 
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp index 05eed969a18af..7eef4a9d12b16 100644 --- a/llvm/lib/CodeGen/BreakFalseDeps.cpp +++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp @@ -285,7 +285,7 @@ bool BreakFalseDeps::runOnMachineFunction(MachineFunction &mf) { TRI = MF->getSubtarget().getRegisterInfo(); RDA = &getAnalysis(); - RegClassInfo.runOnMachineFunction(mf); + RegClassInfo.runOnMachineFunction(mf, /*Rev=*/true); LLVM_DEBUG(dbgs() << "********** BREAK FALSE DEPENDENCIES **********\n"); diff --git a/llvm/lib/CodeGen/RegisterClassInfo.cpp b/llvm/lib/CodeGen/RegisterClassInfo.cpp index 40fc35a16335f..8ead83302c337 100644 --- a/llvm/lib/CodeGen/RegisterClassInfo.cpp +++ b/llvm/lib/CodeGen/RegisterClassInfo.cpp @@ -39,14 +39,16 @@ StressRA("stress-regalloc", cl::Hidden, cl::init(0), cl::value_desc("N"), RegisterClassInfo::RegisterClassInfo() = default; -void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf) { +void RegisterClassInfo::runOnMachineFunction(const MachineFunction &mf, + bool Rev) { bool Update = false; MF = &mf; auto &STI = MF->getSubtarget(); // Allocate new array the first time we see a new target. - if (STI.getRegisterInfo() != TRI) { + if (STI.getRegisterInfo() != TRI || Reverse != Rev) { + Reverse = Rev; TRI = STI.getRegisterInfo(); RegClass.reset(new RCInfo[TRI->getNumRegClasses()]); Update = true; @@ -142,7 +144,12 @@ void RegisterClassInfo::compute(const TargetRegisterClass *RC) const { // FIXME: Once targets reserve registers instead of removing them from the // allocation order, we can simply use begin/end here. - ArrayRef RawOrder = RC->getRawAllocationOrder(*MF); + ArrayRef RawOrder = RC->getRawAllocationOrder(*MF, Reverse); + std::vector ReverseOrder; + if (Reverse) { + llvm::append_range(ReverseOrder, reverse(RawOrder)); + RawOrder = ArrayRef(ReverseOrder); + } for (unsigned PhysReg : RawOrder) { // Remove reserved registers from the allocation order. 
if (Reserved.test(PhysReg)) diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td index 3f9af5639a686..e9ca25d808a56 100644 --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -806,17 +806,37 @@ def VR512_0_15 : RegisterClass<"X86", [v16f32, v8f64, v64i8, v32i16, v16i32, v8i 512, (sequence "ZMM%u", 0, 15)>; // Scalar AVX-512 floating point registers. -def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)>; +def FR32X : RegisterClass<"X86", [f32], 32, (sequence "XMM%u", 0, 31)> { + let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))]; + let AltOrderSelect = [{ + return Rev; + }]; +} -def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)>; +def FR64X : RegisterClass<"X86", [f64], 64, (add FR32X)> { + let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))]; + let AltOrderSelect = [{ + return Rev; + }]; +} def FR16X : RegisterClass<"X86", [f16], 16, (add FR32X)> {let Size = 32;} // Extended VR128 and VR256 for AVX-512 instructions def VR128X : RegisterClass<"X86", [v4f32, v2f64, v8f16, v8bf16, v16i8, v8i16, v4i32, v2i64, f128], - 128, (add FR32X)>; + 128, (add FR32X)> { + let AltOrders = [(add (sequence "XMM%u", 16, 31), (sequence "XMM%u", 0, 15))]; + let AltOrderSelect = [{ + return Rev; + }]; +} def VR256X : RegisterClass<"X86", [v8f32, v4f64, v16f16, v16bf16, v32i8, v16i16, v8i32, v4i64], - 256, (sequence "YMM%u", 0, 31)>; + 256, (sequence "YMM%u", 0, 31)> { + let AltOrders = [(add (sequence "YMM%u", 16, 31), (sequence "YMM%u", 0, 15))]; + let AltOrderSelect = [{ + return Rev; + }]; +} // Mask registers def VK1 : RegisterClass<"X86", [v1i1], 16, (sequence "K%u", 0, 7)> {let Size = 16;} diff --git a/llvm/test/CodeGen/X86/avx-cvt.ll b/llvm/test/CodeGen/X86/avx-cvt.ll index 1bd25273ecd48..fb30044512fa5 100644 --- a/llvm/test/CodeGen/X86/avx-cvt.ll +++ b/llvm/test/CodeGen/X86/avx-cvt.ll @@ -108,7 +108,7 @@ define <2 x double> 
@fpext01(<2 x double> %a0, <4 x float> %a1) nounwind { define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcA: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i64, ptr %e, align 8 %conv = sitofp i64 %tmp1 to double @@ -118,7 +118,7 @@ define double @funcA(ptr nocapture %e) nounwind uwtable readonly ssp { define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcB: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2sdl (%rdi), %xmm15, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i32, ptr %e, align 4 %conv = sitofp i32 %tmp1 to double @@ -128,7 +128,7 @@ define double @funcB(ptr nocapture %e) nounwind uwtable readonly ssp { define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcC: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2ssl (%rdi), %xmm15, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i32, ptr %e, align 4 %conv = sitofp i32 %tmp1 to float @@ -138,7 +138,7 @@ define float @funcC(ptr nocapture %e) nounwind uwtable readonly ssp { define float @funcD(ptr nocapture %e) nounwind uwtable readonly ssp { ; CHECK-LABEL: funcD: ; CHECK: # %bb.0: -; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm0 ; CHECK-NEXT: retq %tmp1 = load i64, ptr %e, align 8 %conv = sitofp i64 %tmp1 to float @@ -183,7 +183,7 @@ declare float @llvm.floor.f32(float %p) define float @floor_f32_load(ptr %aptr) optsize { ; CHECK-LABEL: floor_f32_load: ; CHECK: # %bb.0: -; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vroundss $9, (%rdi), %xmm15, %xmm0 ; CHECK-NEXT: retq %a = load float, ptr %aptr %res = call float @llvm.floor.f32(float %a) @@ -193,7 +193,7 @@ define float @floor_f32_load(ptr %aptr) optsize { define float @floor_f32_load_pgso(ptr %aptr) !prof !14 { ; CHECK-LABEL: 
floor_f32_load_pgso: ; CHECK: # %bb.0: -; CHECK-NEXT: vroundss $9, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vroundss $9, (%rdi), %xmm15, %xmm0 ; CHECK-NEXT: retq %a = load float, ptr %aptr %res = call float @llvm.floor.f32(float %a) @@ -203,7 +203,7 @@ define float @floor_f32_load_pgso(ptr %aptr) !prof !14 { define double @nearbyint_f64_load(ptr %aptr) optsize { ; CHECK-LABEL: nearbyint_f64_load: ; CHECK: # %bb.0: -; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vroundsd $12, (%rdi), %xmm15, %xmm0 ; CHECK-NEXT: retq %a = load double, ptr %aptr %res = call double @llvm.nearbyint.f64(double %a) @@ -213,7 +213,7 @@ define double @nearbyint_f64_load(ptr %aptr) optsize { define double @nearbyint_f64_load_pgso(ptr %aptr) !prof !14 { ; CHECK-LABEL: nearbyint_f64_load_pgso: ; CHECK: # %bb.0: -; CHECK-NEXT: vroundsd $12, (%rdi), %xmm0, %xmm0 +; CHECK-NEXT: vroundsd $12, (%rdi), %xmm15, %xmm0 ; CHECK-NEXT: retq %a = load double, ptr %aptr %res = call double @llvm.nearbyint.f64(double %a) diff --git a/llvm/test/CodeGen/X86/avx512-cvt.ll b/llvm/test/CodeGen/X86/avx512-cvt.ll index a78d97782e6a3..3dd7b571b9215 100644 --- a/llvm/test/CodeGen/X86/avx512-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512-cvt.ll @@ -22,27 +22,27 @@ define <8 x double> @sltof864(<8 x i64> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: 
vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -66,14 +66,14 @@ define <4 x double> @slto4f64(<4 x i64> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq @@ -97,9 +97,9 @@ define <2 x double> @slto2f64(<2 x i64> %a) { ; NODQ-LABEL: slto2f64: ; NODQ: # %bb.0: ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; NODQ-NEXT: retq ; @@ -123,9 +123,9 
@@ define <2 x float> @sltof2f32(<2 x i64> %a) { ; NODQ-LABEL: sltof2f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; NODQ-NEXT: retq ; @@ -148,12 +148,12 @@ define <2 x float> @sltof2f32(<2 x i64> %a) { define <4 x float> @slto4f32_mem(ptr %a) { ; NODQ-LABEL: slto4f32_mem: ; NODQ: # %bb.0: -; NODQ-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 -; NODQ-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 +; NODQ-NEXT: vcvtsi2ssq 8(%rdi), %xmm15, %xmm0 +; NODQ-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; NODQ-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 +; NODQ-NEXT: vcvtsi2ssq 16(%rdi), %xmm15, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; NODQ-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 +; NODQ-NEXT: vcvtsi2ssq 24(%rdi), %xmm15, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; NODQ-NEXT: retq ; @@ -246,16 +246,16 @@ define <4 x float> @slto4f32(<4 x i64> %a) { ; NODQ-LABEL: slto4f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; 
NODQ-NEXT: vzeroupper ; NODQ-NEXT: retq @@ -281,16 +281,16 @@ define <4 x float> @ulto4f32(<4 x i64> %a) { ; NODQ-LABEL: ulto4f32: ; NODQ: # %bb.0: ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NODQ-NEXT: vzeroupper ; NODQ-NEXT: retq @@ -316,16 +316,16 @@ define <4 x float> @ulto4f32_nneg(<4 x i64> %a) { ; NODQ-LABEL: ulto4f32_nneg: ; NODQ: # %bb.0: ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; NODQ-NEXT: vzeroupper ; NODQ-NEXT: retq @@ -864,7 +864,7 @@ define <2 x double> @f32tof64_inreg(<2 x double> %a0, <4 x float> %a1) nounwind define double @sltof64_load(ptr nocapture %e) { ; ALL-LABEL: sltof64_load: ; ALL: # %bb.0: # %entry -; 
ALL-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm0 ; ALL-NEXT: retq entry: %tmp1 = load i64, ptr %e, align 8 @@ -875,7 +875,7 @@ entry: define double @sitof64_load(ptr %e) { ; ALL-LABEL: sitof64_load: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 +; ALL-NEXT: vcvtsi2sdl (%rdi), %xmm15, %xmm0 ; ALL-NEXT: retq entry: %tmp1 = load i32, ptr %e, align 4 @@ -886,7 +886,7 @@ entry: define float @sitof32_load(ptr %e) { ; ALL-LABEL: sitof32_load: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 +; ALL-NEXT: vcvtsi2ssl (%rdi), %xmm15, %xmm0 ; ALL-NEXT: retq entry: %tmp1 = load i32, ptr %e, align 4 @@ -897,7 +897,7 @@ entry: define float @sltof32_load(ptr %e) { ; ALL-LABEL: sltof32_load: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm0 ; ALL-NEXT: retq entry: %tmp1 = load i64, ptr %e, align 8 @@ -990,28 +990,28 @@ define <8 x float> @slto8f32(<8 x i64> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; NODQ-NEXT: 
vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: retq @@ -1034,54 +1034,54 @@ define <16 x float> @slto16f32(<16 x i64> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; 
NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm2 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; NODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2ss %rax, %xmm5, %xmm0 +; NODQ-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1109,27 +1109,27 @@ define <8 x double> @slto8f64(<8 x i64> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; NODQ-NEXT: 
vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1153,53 +1153,53 @@ define <16 x double> @slto16f64(<16 x i64> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, 
%xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm4 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm0, %zmm0 ; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm2 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm4 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm3[0],xmm4[0] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax 
-; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm4 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 +; NODQ-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; NODQ-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm4[0] ; NODQ-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1 ; NODQ-NEXT: vinsertf64x4 $1, %ymm2, %zmm1, %zmm1 @@ -1225,28 +1225,28 @@ define <8 x float> @ulto8f32(<8 x i64> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; NODQ-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-NEXT: 
retq @@ -1269,54 +1269,54 @@ define <16 x float> @ulto16f32(<16 x i64> %a) { ; NODQ: # %bb.0: ; NODQ-NEXT: vextracti32x4 $2, %zmm1, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; NODQ-NEXT: vextracti32x4 $3, %zmm1, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm4, %xmm4 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm1, %xmm1 ; NODQ-NEXT: vmovq %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm1 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; NODQ-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; NODQ-NEXT: vextractf32x4 $2, %zmm0, %xmm2 ; NODQ-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm2, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm2 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[2,3] ; 
NODQ-NEXT: vextractf32x4 $3, %zmm0, %xmm3 ; NODQ-NEXT: vmovq %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm4[0],xmm2[3] ; NODQ-NEXT: vpextrq $1, %xmm3, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[0] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm3 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; NODQ-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-NEXT: vmovq %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm4 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm4 ; NODQ-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; NODQ-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-NEXT: vcvtusi2ss %rax, %xmm5, %xmm0 +; NODQ-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; NODQ-NEXT: vinsertps {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[0] ; NODQ-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -1498,7 +1498,7 @@ define i32 @fptoui(float %a) nounwind { define float @uitof32(i32 %a) nounwind { ; ALL-LABEL: uitof32: ; ALL: # %bb.0: -; ALL-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; ALL-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; ALL-NEXT: retq %b = uitofp i32 %a to float ret float %b @@ -1507,7 +1507,7 @@ define float @uitof32(i32 %a) nounwind { define double @uitof64(i32 %a) nounwind { ; ALL-LABEL: uitof64: ; ALL: # %bb.0: -; ALL-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; ALL-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 ; ALL-NEXT: retq %b = uitofp i32 %a to double ret double %b diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll index 88c99a06326ab..a664cc7f17a5c 100644 --- 
a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -1221,17 +1221,17 @@ define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signex ; X32-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X32-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; X32-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; X32-NEXT: vcvtsi2sd %eax, %xmm2, %xmm1 +; X32-NEXT: vcvtsi2sd %eax, %xmm3, %xmm1 ; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; X32-NEXT: vcvtsi2sd %ecx, %xmm2, %xmm1 +; X32-NEXT: vcvtsi2sd %ecx, %xmm3, %xmm1 ; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; X32-NEXT: vmovd %edx, %xmm1 ; X32-NEXT: vpinsrd $1, %edi, %xmm1, %xmm1 ; X32-NEXT: vcvtqq2pd %ymm1, %ymm1 ; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; X32-NEXT: vcvtsi2sd %esi, %xmm2, %xmm1 +; X32-NEXT: vcvtsi2sd %esi, %xmm3, %xmm1 ; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; X32-NEXT: vcvtsi2sdl (%ebx), %xmm2, %xmm1 +; X32-NEXT: vcvtsi2sdl (%ebx), %xmm3, %xmm1 ; X32-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; X32-NEXT: vcvttsd2si %xmm0, %eax ; X32-NEXT: popl %ebx @@ -1242,15 +1242,15 @@ define dso_local x86_regcallcc i32 @test_argRetMixTypes(double, float, i8 signex ; WIN64: # %bb.0: ; WIN64-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; WIN64-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; WIN64-NEXT: vcvtsi2sd %eax, %xmm2, %xmm1 +; WIN64-NEXT: vcvtsi2sd %eax, %xmm7, %xmm1 ; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; WIN64-NEXT: vcvtsi2sd %ecx, %xmm2, %xmm1 +; WIN64-NEXT: vcvtsi2sd %ecx, %xmm7, %xmm1 ; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; WIN64-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm1 +; WIN64-NEXT: vcvtsi2sd %rdx, %xmm7, %xmm1 ; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; WIN64-NEXT: vcvtsi2sd %edi, %xmm2, %xmm1 +; WIN64-NEXT: vcvtsi2sd %edi, %xmm7, %xmm1 ; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; WIN64-NEXT: vcvtsi2sdl (%rsi), %xmm2, %xmm1 +; WIN64-NEXT: vcvtsi2sdl (%rsi), %xmm7, %xmm1 ; WIN64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; WIN64-NEXT: vcvttsd2si %xmm0, %eax ; WIN64-NEXT: retq @@ -1259,15 +1259,15 @@ define dso_local x86_regcallcc 
i32 @test_argRetMixTypes(double, float, i8 signex ; LINUXOSX64: # %bb.0: ; LINUXOSX64-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; LINUXOSX64-NEXT: vaddsd %xmm0, %xmm1, %xmm0 -; LINUXOSX64-NEXT: vcvtsi2sd %eax, %xmm2, %xmm1 +; LINUXOSX64-NEXT: vcvtsi2sd %eax, %xmm7, %xmm1 ; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; LINUXOSX64-NEXT: vcvtsi2sd %ecx, %xmm2, %xmm1 +; LINUXOSX64-NEXT: vcvtsi2sd %ecx, %xmm7, %xmm1 ; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; LINUXOSX64-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm1 +; LINUXOSX64-NEXT: vcvtsi2sd %rdx, %xmm7, %xmm1 ; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; LINUXOSX64-NEXT: vcvtsi2sd %edi, %xmm2, %xmm1 +; LINUXOSX64-NEXT: vcvtsi2sd %edi, %xmm7, %xmm1 ; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 -; LINUXOSX64-NEXT: vcvtsi2sdl (%rsi), %xmm2, %xmm1 +; LINUXOSX64-NEXT: vcvtsi2sdl (%rsi), %xmm7, %xmm1 ; LINUXOSX64-NEXT: vaddsd %xmm1, %xmm0, %xmm0 ; LINUXOSX64-NEXT: vcvttsd2si %xmm0, %eax ; LINUXOSX64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll index 26abf51c76b23..3f6ddc6ecfd70 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-cvt.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-cvt.ll @@ -412,7 +412,7 @@ define double @extload_f16_f64(ptr %x) { define float @extload_f16_f32_optsize(ptr %x) optsize { ; X64-LABEL: extload_f16_f32_optsize: ; X64: # %bb.0: -; X64-NEXT: vcvtsh2ss (%rdi), %xmm0, %xmm0 +; X64-NEXT: vcvtsh2ss (%rdi), %xmm15, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: extload_f16_f32_optsize: @@ -420,7 +420,7 @@ define float @extload_f16_f32_optsize(ptr %x) optsize { ; X86-NEXT: pushl %eax ; X86-NEXT: .cfi_def_cfa_offset 8 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vcvtsh2ss (%eax), %xmm0, %xmm0 +; X86-NEXT: vcvtsh2ss (%eax), %xmm7, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -434,7 +434,7 @@ define float @extload_f16_f32_optsize(ptr %x) optsize { define double @extload_f16_f64_optsize(ptr %x) optsize { ; X64-LABEL: 
extload_f16_f64_optsize: ; X64: # %bb.0: -; X64-NEXT: vcvtsh2sd (%rdi), %xmm0, %xmm0 +; X64-NEXT: vcvtsh2sd (%rdi), %xmm15, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: extload_f16_f64_optsize: @@ -447,7 +447,7 @@ define double @extload_f16_f64_optsize(ptr %x) optsize { ; X86-NEXT: andl $-8, %esp ; X86-NEXT: subl $8, %esp ; X86-NEXT: movl 8(%ebp), %eax -; X86-NEXT: vcvtsh2sd (%eax), %xmm0, %xmm0 +; X86-NEXT: vcvtsh2sd (%eax), %xmm7, %xmm0 ; X86-NEXT: vmovsd %xmm0, (%esp) ; X86-NEXT: fldl (%esp) ; X86-NEXT: movl %ebp, %esp @@ -559,13 +559,13 @@ define half @s8_to_half(i8 %x) { ; X64-LABEL: s8_to_half: ; X64: # %bb.0: ; X64-NEXT: movsbl %dil, %eax -; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: s8_to_half: ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl %a = sitofp i8 %x to half ret half %a @@ -575,13 +575,13 @@ define half @s16_to_half(i16 %x) { ; X64-LABEL: s16_to_half: ; X64: # %bb.0: ; X64-NEXT: movswl %di, %eax -; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: s16_to_half: ; X86: # %bb.0: ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl %a = sitofp i16 %x to half ret half %a @@ -590,12 +590,12 @@ define half @s16_to_half(i16 %x) { define half @s32_to_half(i32 %x) { ; X64-LABEL: s32_to_half: ; X64: # %bb.0: -; X64-NEXT: vcvtsi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %edi, %xmm31, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: s32_to_half: ; X86: # %bb.0: -; X86-NEXT: vcvtsi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vcvtsi2shl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl %a = sitofp i32 %x to half ret half %a @@ -604,7 +604,7 @@ define half @s32_to_half(i32 %x) { define half @s64_to_half(i64 %x) { ; X64-LABEL: s64_to_half: ; X64: # 
%bb.0: -; X64-NEXT: vcvtsi2sh %rdi, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %rdi, %xmm31, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: s64_to_half: @@ -644,13 +644,13 @@ define half @u8_to_half(i8 %x) { ; X64-LABEL: u8_to_half: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: u8_to_half: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl %a = uitofp i8 %x to half ret half %a @@ -660,13 +660,13 @@ define half @u16_to_half(i16 %x) { ; X64-LABEL: u16_to_half: ; X64: # %bb.0: ; X64-NEXT: movzwl %di, %eax -; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: u16_to_half: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl %a = uitofp i16 %x to half ret half %a @@ -675,12 +675,12 @@ define half @u16_to_half(i16 %x) { define half @u32_to_half(i32 %x) { ; X64-LABEL: u32_to_half: ; X64: # %bb.0: -; X64-NEXT: vcvtusi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: vcvtusi2sh %edi, %xmm31, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: u32_to_half: ; X86: # %bb.0: -; X86-NEXT: vcvtusi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vcvtusi2shl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl %a = uitofp i32 %x to half ret half %a @@ -689,7 +689,7 @@ define half @u32_to_half(i32 %x) { define half @u64_to_half(i64 %x) { ; X64-LABEL: u64_to_half: ; X64: # %bb.0: -; X64-NEXT: vcvtusi2sh %rdi, %xmm0, %xmm0 +; X64-NEXT: vcvtusi2sh %rdi, %xmm31, %xmm0 ; X64-NEXT: retq ; ; X86-LABEL: u64_to_half: diff --git a/llvm/test/CodeGen/X86/avx512fp16-novl.ll b/llvm/test/CodeGen/X86/avx512fp16-novl.ll index 1c4b7316c283c..d17cacc0e1ad7 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-novl.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-novl.ll @@ 
-16,14 +16,14 @@ define <4 x half> @vector_sint32ToHalf(<4 x i32> %int32) { ; CHECK-LABEL: vector_sint32ToHalf: ; CHECK: # %bb.0: ; CHECK-NEXT: vextractps $3, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm1, %xmm1 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm1 ; CHECK-NEXT: vextractps $2, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm2, %xmm2 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm2 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; CHECK-NEXT: vextractps $1, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm3, %xmm2 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm2 ; CHECK-NEXT: vmovd %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm3, %xmm0 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; CHECK-NEXT: retq @@ -36,32 +36,32 @@ define <8 x half> @vector_sint16ToHalf(<8 x i16> %int16) { ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $7, %xmm0, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: vcvtsi2sh %eax, %xmm1, %xmm1 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm1 ; CHECK-NEXT: vpextrw $6, %xmm0, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: vcvtsi2sh %eax, %xmm2, %xmm2 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm2 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; CHECK-NEXT: vpextrw $5, %xmm0, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: vcvtsi2sh %eax, %xmm3, %xmm2 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm2 ; CHECK-NEXT: vpextrw $4, %xmm0, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: vcvtsi2sh %eax, %xmm3, %xmm3 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm3 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-NEXT: vpextrw $3, %xmm0, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: vcvtsi2sh %eax, %xmm4, 
%xmm2 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm2 ; CHECK-NEXT: vpextrw $2, %xmm0, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: vcvtsi2sh %eax, %xmm4, %xmm3 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm3 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; CHECK-NEXT: vpextrw $1, %xmm0, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: vcvtsi2sh %eax, %xmm4, %xmm3 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm3 ; CHECK-NEXT: vmovw %xmm0, %eax ; CHECK-NEXT: cwtl -; CHECK-NEXT: vcvtsi2sh %eax, %xmm4, %xmm0 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -97,25 +97,25 @@ define <8 x half> @vector_uint16ToHalf(<8 x i16> %int16) { ; CHECK-LABEL: vector_uint16ToHalf: ; CHECK: # %bb.0: ; CHECK-NEXT: vpextrw $7, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm1, %xmm1 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm1 ; CHECK-NEXT: vpextrw $6, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm2, %xmm2 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm2 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; CHECK-NEXT: vpextrw $5, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm3, %xmm2 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm2 ; CHECK-NEXT: vpextrw $4, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm3, %xmm3 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm3 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; CHECK-NEXT: vpextrw $3, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm4, %xmm2 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm2 ; CHECK-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm4, %xmm3 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, 
%xmm3 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] ; CHECK-NEXT: vpextrw $1, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm4, %xmm3 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm3 ; CHECK-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-NEXT: vcvtsi2sh %eax, %xmm4, %xmm0 +; CHECK-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; CHECK-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; CHECK-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/break-false-dep.ll b/llvm/test/CodeGen/X86/break-false-dep.ll index 5acbccf41c5d3..6943622fac7f2 100644 --- a/llvm/test/CodeGen/X86/break-false-dep.ll +++ b/llvm/test/CodeGen/X86/break-false-dep.ll @@ -36,7 +36,7 @@ define dso_local float @t2(ptr nocapture %x) nounwind readonly ssp optsize { ; ; AVX-LABEL: t2: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsd2ss (%rcx), %xmm0, %xmm0 +; AVX-NEXT: vcvtsd2ss (%rcx), %xmm5, %xmm0 ; AVX-NEXT: retq entry: %0 = load double, ptr %x, align 8 @@ -93,7 +93,7 @@ define dso_local float @squirtf_size(ptr %x) nounwind optsize { ; ; AVX-LABEL: squirtf_size: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vsqrtss (%rcx), %xmm0, %xmm0 +; AVX-NEXT: vsqrtss (%rcx), %xmm5, %xmm0 ; AVX-NEXT: retq entry: %z = load float, ptr %x @@ -114,7 +114,7 @@ define dso_local double @squirt_size(ptr %x) nounwind optsize { ; ; AVX-LABEL: squirt_size: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vsqrtsd (%rcx), %xmm0, %xmm0 +; AVX-NEXT: vsqrtsd (%rcx), %xmm5, %xmm0 ; AVX-NEXT: retq entry: %z = load double, ptr %x @@ -199,8 +199,8 @@ define dso_local float @loopdep1(i32 %m) nounwind uwtable readnone ssp { ; AVX1-NEXT: .p2align 4 ; AVX1-NEXT: .LBB6_3: # %for.body ; AVX1-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX1-NEXT: vcvtsi2ss %eax, %xmm4, %xmm2 -; AVX1-NEXT: vcvtsi2ss %ecx, %xmm4, %xmm3 +; AVX1-NEXT: vcvtsi2ss %eax, %xmm5, %xmm2 +; AVX1-NEXT: 
vcvtsi2ss %ecx, %xmm5, %xmm3 ; AVX1-NEXT: vaddss %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: incl %eax @@ -226,9 +226,9 @@ define dso_local float @loopdep1(i32 %m) nounwind uwtable readnone ssp { ; AVX512VL-NEXT: .p2align 4 ; AVX512VL-NEXT: .LBB6_3: # %for.body ; AVX512VL-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX512VL-NEXT: vcvtsi2ss %eax, %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtsi2ss %eax, %xmm5, %xmm2 ; AVX512VL-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtsi2ss %ecx, %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtsi2ss %ecx, %xmm5, %xmm2 ; AVX512VL-NEXT: vaddss %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: incl %eax ; AVX512VL-NEXT: decl %ecx @@ -358,8 +358,8 @@ define i64 @loopdep2(ptr nocapture %x, ptr nocapture %y) nounwind { ; AVX-NEXT: .p2align 4 ; AVX-NEXT: .LBB7_1: # %loop ; AVX-NEXT: # =>This Inner Loop Header: Depth=1 -; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vcvtsi2sd %rcx, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm5, %xmm5, %xmm5 +; AVX-NEXT: vcvtsi2sd %rcx, %xmm5, %xmm0 ; AVX-NEXT: vmovsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; AVX-NEXT: #APP ; AVX-NEXT: #NO_APP @@ -566,8 +566,8 @@ define dso_local void @loopdep3() { ; AVX-NEXT: .LBB8_2: # %for.body3 ; AVX-NEXT: # Parent Loop BB8_1 Depth=1 ; AVX-NEXT: # => This Inner Loop Header: Depth=2 -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sdl (%r11), %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm5, %xmm5, %xmm5 +; AVX-NEXT: vcvtsi2sdl (%r11), %xmm5, %xmm0 ; AVX-NEXT: vmulsd (%rsi,%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmulsd (%rsi,%r8), %xmm0, %xmm0 ; AVX-NEXT: vmulsd (%rsi,%r9), %xmm0, %xmm0 @@ -761,8 +761,8 @@ define dso_local double @inlineasmdep(i64 %arg) { ; AVX-NEXT: #NO_APP ; AVX-NEXT: #APP ; AVX-NEXT: #NO_APP -; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vcvtsi2sd %rcx, %xmm3, %xmm0 ; AVX-NEXT: vmovaps (%rsp), %xmm6 # 16-byte Reload ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), 
%xmm7 # 16-byte Reload ; AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/coalescer-commute1.ll b/llvm/test/CodeGen/X86/coalescer-commute1.ll index 28502782cf642..f4decb7e2e0c5 100644 --- a/llvm/test/CodeGen/X86/coalescer-commute1.ll +++ b/llvm/test/CodeGen/X86/coalescer-commute1.ll @@ -16,7 +16,7 @@ define void @runcont(ptr %source) nounwind { ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: LBB0_1: ## %bb ; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vcvtsi2ssl (%eax,%edx,4), %xmm2, %xmm1 +; CHECK-NEXT: vcvtsi2ssl (%eax,%edx,4), %xmm7, %xmm1 ; CHECK-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: incl %edx ; CHECK-NEXT: cmpl %edx, %ecx diff --git a/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll b/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll index cfca56d35998e..00aa9cd8a27f3 100644 --- a/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll +++ b/llvm/test/CodeGen/X86/fast-isel-fptrunc-fpext.ll @@ -78,7 +78,7 @@ define double @single_to_double_rm_optsize(ptr %x) optsize { ; ; AVX-LABEL: single_to_double_rm_optsize: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtss2sd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtss2sd (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq entry: %0 = load float, ptr %x, align 4 @@ -112,7 +112,7 @@ define float @double_to_single_rm_optsize(ptr %x) optsize { ; ; AVX-LABEL: double_to_single_rm_optsize: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsd2ss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtsd2ss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq entry: %0 = load double, ptr %x, align 8 diff --git a/llvm/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll b/llvm/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll index 42d65f7cd64b6..5bf08f1c523d2 100644 --- a/llvm/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll +++ b/llvm/test/CodeGen/X86/fast-isel-int-float-conversion-x86-64.ll @@ -12,7 +12,7 @@ define double @long_to_double_rr(i64 %a) { ; ; AVX-LABEL: long_to_double_rr: ; AVX: # %bb.0: # 
%entry -; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %rdi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %0 = sitofp i64 %a to double @@ -27,7 +27,7 @@ define double @long_to_double_rm(ptr %a) { ; ; AVX-LABEL: long_to_double_rm: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq entry: %0 = load i64, ptr %a @@ -43,7 +43,7 @@ define double @long_to_double_rm_optsize(ptr %a) optsize { ; ; AVX-LABEL: long_to_double_rm_optsize: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2sdq (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq entry: %0 = load i64, ptr %a @@ -59,7 +59,7 @@ define float @long_to_float_rr(i64 %a) { ; ; AVX-LABEL: long_to_float_rr: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %0 = sitofp i64 %a to float @@ -74,7 +74,7 @@ define float @long_to_float_rm(ptr %a) { ; ; AVX-LABEL: long_to_float_rm: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq entry: %0 = load i64, ptr %a @@ -90,7 +90,7 @@ define float @long_to_float_rm_optsize(ptr %a) optsize { ; ; AVX-LABEL: long_to_float_rm_optsize: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ssq (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq entry: %0 = load i64, ptr %a diff --git a/llvm/test/CodeGen/X86/fast-isel-int-float-conversion.ll b/llvm/test/CodeGen/X86/fast-isel-int-float-conversion.ll index 36daba63f08bc..b39d9a7a3a6d0 100644 --- a/llvm/test/CodeGen/X86/fast-isel-int-float-conversion.ll +++ b/llvm/test/CodeGen/X86/fast-isel-int-float-conversion.ll @@ -15,7 +15,7 @@ define double @int_to_double_rr(i32 %a) { ; ; AVX-LABEL: int_to_double_rr: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX-NEXT: retq ; ; 
SSE2_X86-LABEL: int_to_double_rr: @@ -44,7 +44,7 @@ define double @int_to_double_rr(i32 %a) { ; AVX_X86-NEXT: .cfi_def_cfa_register %ebp ; AVX_X86-NEXT: andl $-8, %esp ; AVX_X86-NEXT: subl $8, %esp -; AVX_X86-NEXT: vcvtsi2sdl 8(%ebp), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtsi2sdl 8(%ebp), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovsd %xmm0, (%esp) ; AVX_X86-NEXT: fldl (%esp) ; AVX_X86-NEXT: movl %ebp, %esp @@ -64,7 +64,7 @@ define double @int_to_double_rm(ptr %a) { ; ; AVX-LABEL: int_to_double_rm: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sdl (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq ; ; SSE2_X86-LABEL: int_to_double_rm: @@ -95,7 +95,7 @@ define double @int_to_double_rm(ptr %a) { ; AVX_X86-NEXT: andl $-8, %esp ; AVX_X86-NEXT: subl $8, %esp ; AVX_X86-NEXT: movl 8(%ebp), %eax -; AVX_X86-NEXT: vcvtsi2sdl (%eax), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtsi2sdl (%eax), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovsd %xmm0, (%esp) ; AVX_X86-NEXT: fldl (%esp) ; AVX_X86-NEXT: movl %ebp, %esp @@ -116,7 +116,7 @@ define double @int_to_double_rm_optsize(ptr %a) optsize { ; ; AVX-LABEL: int_to_double_rm_optsize: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2sdl (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sdl (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq ; ; SSE2_X86-LABEL: int_to_double_rm_optsize: @@ -147,7 +147,7 @@ define double @int_to_double_rm_optsize(ptr %a) optsize { ; AVX_X86-NEXT: andl $-8, %esp ; AVX_X86-NEXT: subl $8, %esp ; AVX_X86-NEXT: movl 8(%ebp), %eax -; AVX_X86-NEXT: vcvtsi2sdl (%eax), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtsi2sdl (%eax), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovsd %xmm0, (%esp) ; AVX_X86-NEXT: fldl (%esp) ; AVX_X86-NEXT: movl %ebp, %esp @@ -168,7 +168,7 @@ define float @int_to_float_rr(i32 %a) { ; ; AVX-LABEL: int_to_float_rr: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX-NEXT: retq ; ; SSE2_X86-LABEL: int_to_float_rr: @@ -186,7 +186,7 @@ define float @int_to_float_rr(i32 %a) { ; 
AVX_X86: # %bb.0: # %entry ; AVX_X86-NEXT: pushl %eax ; AVX_X86-NEXT: .cfi_def_cfa_offset 8 -; AVX_X86-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovss %xmm0, (%esp) ; AVX_X86-NEXT: flds (%esp) ; AVX_X86-NEXT: popl %eax @@ -205,7 +205,7 @@ define float @int_to_float_rm(ptr %a) { ; ; AVX-LABEL: int_to_float_rm: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ssl (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq ; ; SSE2_X86-LABEL: int_to_float_rm: @@ -225,7 +225,7 @@ define float @int_to_float_rm(ptr %a) { ; AVX_X86-NEXT: pushl %eax ; AVX_X86-NEXT: .cfi_def_cfa_offset 8 ; AVX_X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX_X86-NEXT: vcvtsi2ssl (%eax), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtsi2ssl (%eax), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovss %xmm0, (%esp) ; AVX_X86-NEXT: flds (%esp) ; AVX_X86-NEXT: popl %eax @@ -245,7 +245,7 @@ define float @int_to_float_rm_optsize(ptr %a) optsize { ; ; AVX-LABEL: int_to_float_rm_optsize: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ssl (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ssl (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq ; ; SSE2_X86-LABEL: int_to_float_rm_optsize: @@ -265,7 +265,7 @@ define float @int_to_float_rm_optsize(ptr %a) optsize { ; AVX_X86-NEXT: pushl %eax ; AVX_X86-NEXT: .cfi_def_cfa_offset 8 ; AVX_X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX_X86-NEXT: vcvtsi2ssl (%eax), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtsi2ssl (%eax), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovss %xmm0, (%esp) ; AVX_X86-NEXT: flds (%esp) ; AVX_X86-NEXT: popl %eax diff --git a/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion-x86-64.ll b/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion-x86-64.ll index d05bcfe3fd1e7..77ef9ee5ad2b7 100644 --- a/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion-x86-64.ll +++ b/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion-x86-64.ll @@ -5,7 +5,7 @@ define double @long_to_double_rr(i64 %a) { ; ALL-LABEL: 
long_to_double_rr: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; ALL-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 ; ALL-NEXT: retq entry: %0 = uitofp i64 %a to double @@ -15,7 +15,7 @@ entry: define double @long_to_double_rm(ptr %a) { ; ALL-LABEL: long_to_double_rm: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtusi2sdq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: vcvtusi2sdq (%rdi), %xmm15, %xmm0 ; ALL-NEXT: retq entry: %0 = load i64, ptr %a @@ -26,7 +26,7 @@ entry: define double @long_to_double_rm_optsize(ptr %a) optsize { ; ALL-LABEL: long_to_double_rm_optsize: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtusi2sdq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: vcvtusi2sdq (%rdi), %xmm15, %xmm0 ; ALL-NEXT: retq entry: %0 = load i64, ptr %a @@ -37,7 +37,7 @@ entry: define float @long_to_float_rr(i64 %a) { ; ALL-LABEL: long_to_float_rr: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; ALL-NEXT: vcvtusi2ss %rdi, %xmm15, %xmm0 ; ALL-NEXT: retq entry: %0 = uitofp i64 %a to float @@ -47,7 +47,7 @@ entry: define float @long_to_float_rm(ptr %a) { ; ALL-LABEL: long_to_float_rm: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtusi2ssq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: vcvtusi2ssq (%rdi), %xmm15, %xmm0 ; ALL-NEXT: retq entry: %0 = load i64, ptr %a @@ -58,7 +58,7 @@ entry: define float @long_to_float_rm_optsize(ptr %a) optsize { ; ALL-LABEL: long_to_float_rm_optsize: ; ALL: # %bb.0: # %entry -; ALL-NEXT: vcvtusi2ssq (%rdi), %xmm0, %xmm0 +; ALL-NEXT: vcvtusi2ssq (%rdi), %xmm15, %xmm0 ; ALL-NEXT: retq entry: %0 = load i64, ptr %a diff --git a/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion.ll b/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion.ll index b7f9af6165a9c..de5765baeb9d5 100644 --- a/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion.ll +++ b/llvm/test/CodeGen/X86/fast-isel-uint-float-conversion.ll @@ -6,7 +6,7 @@ define double @int_to_double_rr(i32 %a) { ; AVX-LABEL: int_to_double_rr: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; 
AVX-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 ; AVX-NEXT: retq ; ; AVX_X86-LABEL: int_to_double_rr: @@ -18,7 +18,7 @@ define double @int_to_double_rr(i32 %a) { ; AVX_X86-NEXT: .cfi_def_cfa_register %ebp ; AVX_X86-NEXT: andl $-8, %esp ; AVX_X86-NEXT: subl $8, %esp -; AVX_X86-NEXT: vcvtusi2sdl 8(%ebp), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtusi2sdl 8(%ebp), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovsd %xmm0, (%esp) ; AVX_X86-NEXT: fldl (%esp) ; AVX_X86-NEXT: movl %ebp, %esp @@ -33,7 +33,7 @@ entry: define double @int_to_double_rm(ptr %a) { ; AVX-LABEL: int_to_double_rm: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtusi2sdl (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtusi2sdl (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq ; ; AVX_X86-LABEL: int_to_double_rm: @@ -46,7 +46,7 @@ define double @int_to_double_rm(ptr %a) { ; AVX_X86-NEXT: andl $-8, %esp ; AVX_X86-NEXT: subl $8, %esp ; AVX_X86-NEXT: movl 8(%ebp), %eax -; AVX_X86-NEXT: vcvtusi2sdl (%eax), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtusi2sdl (%eax), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovsd %xmm0, (%esp) ; AVX_X86-NEXT: fldl (%esp) ; AVX_X86-NEXT: movl %ebp, %esp @@ -62,7 +62,7 @@ entry: define double @int_to_double_rm_optsize(ptr %a) optsize { ; AVX-LABEL: int_to_double_rm_optsize: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtusi2sdl (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtusi2sdl (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq ; ; AVX_X86-LABEL: int_to_double_rm_optsize: @@ -75,7 +75,7 @@ define double @int_to_double_rm_optsize(ptr %a) optsize { ; AVX_X86-NEXT: andl $-8, %esp ; AVX_X86-NEXT: subl $8, %esp ; AVX_X86-NEXT: movl 8(%ebp), %eax -; AVX_X86-NEXT: vcvtusi2sdl (%eax), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtusi2sdl (%eax), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovsd %xmm0, (%esp) ; AVX_X86-NEXT: fldl (%esp) ; AVX_X86-NEXT: movl %ebp, %esp @@ -91,14 +91,14 @@ entry: define float @int_to_float_rr(i32 %a) { ; AVX-LABEL: int_to_float_rr: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; AVX-NEXT: retq ; ; AVX_X86-LABEL: 
int_to_float_rr: ; AVX_X86: # %bb.0: # %entry ; AVX_X86-NEXT: pushl %eax ; AVX_X86-NEXT: .cfi_def_cfa_offset 8 -; AVX_X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovss %xmm0, (%esp) ; AVX_X86-NEXT: flds (%esp) ; AVX_X86-NEXT: popl %eax @@ -112,7 +112,7 @@ entry: define float @int_to_float_rm(ptr %a) { ; AVX-LABEL: int_to_float_rm: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtusi2ssl (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtusi2ssl (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq ; ; AVX_X86-LABEL: int_to_float_rm: @@ -120,7 +120,7 @@ define float @int_to_float_rm(ptr %a) { ; AVX_X86-NEXT: pushl %eax ; AVX_X86-NEXT: .cfi_def_cfa_offset 8 ; AVX_X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX_X86-NEXT: vcvtusi2ssl (%eax), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtusi2ssl (%eax), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovss %xmm0, (%esp) ; AVX_X86-NEXT: flds (%esp) ; AVX_X86-NEXT: popl %eax @@ -135,7 +135,7 @@ entry: define float @int_to_float_rm_optsize(ptr %a) optsize { ; AVX-LABEL: int_to_float_rm_optsize: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtusi2ssl (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vcvtusi2ssl (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq ; ; AVX_X86-LABEL: int_to_float_rm_optsize: @@ -143,7 +143,7 @@ define float @int_to_float_rm_optsize(ptr %a) optsize { ; AVX_X86-NEXT: pushl %eax ; AVX_X86-NEXT: .cfi_def_cfa_offset 8 ; AVX_X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX_X86-NEXT: vcvtusi2ssl (%eax), %xmm0, %xmm0 +; AVX_X86-NEXT: vcvtusi2ssl (%eax), %xmm7, %xmm0 ; AVX_X86-NEXT: vmovss %xmm0, (%esp) ; AVX_X86-NEXT: flds (%esp) ; AVX_X86-NEXT: popl %eax diff --git a/llvm/test/CodeGen/X86/fcmp-logic.ll b/llvm/test/CodeGen/X86/fcmp-logic.ll index 7b806bca43c2e..98fa725b2ea3a 100644 --- a/llvm/test/CodeGen/X86/fcmp-logic.ll +++ b/llvm/test/CodeGen/X86/fcmp-logic.ll @@ -399,11 +399,11 @@ define i1 @PR140534(i32 %a0, i32 %a1, i32 %a2) { ; AVX1-LABEL: PR140534: ; AVX1: # %bb.0: ; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: 
vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-NEXT: movl %esi, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-NEXT: movl %edx, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX1-NEXT: vcmpltsd %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vcmpltsd %xmm0, %xmm1, %xmm0 @@ -414,9 +414,9 @@ define i1 @PR140534(i32 %a0, i32 %a1, i32 %a2) { ; ; AVX512-LABEL: PR140534: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 -; AVX512-NEXT: vcvtusi2sd %esi, %xmm1, %xmm1 -; AVX512-NEXT: vcvtusi2sd %edx, %xmm2, %xmm2 +; AVX512-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 +; AVX512-NEXT: vcvtusi2sd %esi, %xmm15, %xmm1 +; AVX512-NEXT: vcvtusi2sd %edx, %xmm15, %xmm2 ; AVX512-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512-NEXT: vcmpltsd %xmm2, %xmm1, %k0 ; AVX512-NEXT: vcmpltsd %xmm0, %xmm1, %k1 diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 5519d9b787b7f..d59b12c6d1231 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -887,14 +887,14 @@ define double @fmul_pow_shl_cnt_fail_maybe_non_pow2(i64 %v, i64 %cnt) nounwind { ; CHECK-NO-FASTFMA-NEXT: movq %rsi, %rcx ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rdi -; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; ; CHECK-FMA-LABEL: fmul_pow_shl_cnt_fail_maybe_non_pow2: ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: shlxq %rsi, %rdi, %rax -; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; 
CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq %shl = shl nuw i64 %v, %cnt @@ -927,9 +927,9 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; CHECK-AVX2-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] ; CHECK-AVX2-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm1 +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; CHECK-AVX2-NEXT: vmovq %xmm0, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; CHECK-AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] ; CHECK-AVX2-NEXT: vmulps %xmm1, %xmm0, %xmm0 @@ -940,9 +940,9 @@ define <2 x float> @fmul_pow_shl_cnt_vec_fail_expensive_cast(<2 x i64> %cnt) nou ; CHECK-NO-FASTFMA-NEXT: vpmovsxbq {{.*#+}} xmm1 = [2,2] ; CHECK-NO-FASTFMA-NEXT: vpsllvq %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vpextrq $1, %xmm0, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm2, %xmm1 +; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; CHECK-NO-FASTFMA-NEXT: vmovq %xmm0, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; CHECK-NO-FASTFMA-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1] ; CHECK-NO-FASTFMA-NEXT: vmulps %xmm1, %xmm0, %xmm0 @@ -1108,13 +1108,13 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind { ; CHECK-AVX2-NEXT: vpsllvd %ymm0, %ymm1, %ymm0 ; CHECK-AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 +; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: vzeroupper ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovss 
%xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill ; CHECK-AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; CHECK-AVX2-NEXT: vpextrw $0, %xmm0, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm2, %xmm0 +; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: vzeroupper ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT @@ -1201,7 +1201,7 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind { ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq ; @@ -1209,7 +1209,7 @@ define double @fmul_pow_shl_cnt_fail_maybe_bad_exp(i64 %cnt) nounwind { ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; CHECK-FMA-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; CHECK-FMA-NEXT: retq %shl = shl nuw i64 1, %cnt @@ -1317,11 +1317,11 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: testq %rax, %rax ; CHECK-AVX2-NEXT: js .LBB23_1 ; CHECK-AVX2-NEXT: # %bb.2: -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: jmp .LBB23_3 ; CHECK-AVX2-NEXT: .LBB23_1: ; CHECK-AVX2-NEXT: shrq %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; CHECK-AVX2-NEXT: .LBB23_3: ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] @@ -1334,7 +1334,7 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax ; 
CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq @@ -1343,7 +1343,7 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind { ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $8, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: retq @@ -1371,7 +1371,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { ; CHECK-AVX2-NEXT: movl $8, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-AVX2-NEXT: shlq %cl, %rax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1382,7 +1382,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { ; CHECK-NO-FASTFMA-NEXT: movl $8, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq @@ -1391,7 +1391,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind { ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $8, %eax ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax -; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; 
CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: retq @@ -1451,7 +1451,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; CHECK-AVX2-NEXT: movl $1, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] @@ -1466,7 +1466,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] @@ -1478,7 +1478,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind { ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] @@ -1562,7 +1562,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax ; CHECK-AVX2-NEXT: movzwl %ax, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; 
CHECK-AVX2-NEXT: callq __extendhfsf2@PLT ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] @@ -1578,7 +1578,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax ; CHECK-NO-FASTFMA-NEXT: movzwl %ax, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] @@ -1591,7 +1591,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind { ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax ; CHECK-FMA-NEXT: movzwl %ax, %eax -; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] @@ -1648,7 +1648,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ; CHECK-AVX2-NEXT: movl $1, %eax ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-AVX2-NEXT: shll %cl, %eax -; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-AVX2-NEXT: retq @@ -1659,7 +1659,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ; CHECK-NO-FASTFMA-NEXT: movl $1, %eax ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax -; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 +; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 ; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] ; 
CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-NO-FASTFMA-NEXT: retq @@ -1668,7 +1668,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind { ; CHECK-FMA: # %bb.0: ; CHECK-FMA-NEXT: movl $1, %eax ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax -; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0 +; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 ; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0] ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0 ; CHECK-FMA-NEXT: retq diff --git a/llvm/test/CodeGen/X86/fold-load-unops.ll b/llvm/test/CodeGen/X86/fold-load-unops.ll index d3e34f8d8ffd3..35e14e5cf8980 100644 --- a/llvm/test/CodeGen/X86/fold-load-unops.ll +++ b/llvm/test/CodeGen/X86/fold-load-unops.ll @@ -89,7 +89,7 @@ define float @rcpss_size(ptr %a) optsize { ; ; AVX-LABEL: rcpss_size: ; AVX: # %bb.0: -; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vrcpss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load float, ptr %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -106,7 +106,7 @@ define <4 x float> @rcpss_full_size(ptr %a) optsize { ; ; AVX-LABEL: rcpss_full_size: ; AVX: # %bb.0: -; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vrcpss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load <4 x float>, ptr %a %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld) @@ -121,7 +121,7 @@ define float @rcpss_pgso(ptr %a) !prof !14 { ; ; AVX-LABEL: rcpss_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vrcpss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load float, ptr %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -138,7 +138,7 @@ define <4 x float> @rcpss_full_pgso(ptr %a) !prof !14 { ; ; AVX-LABEL: rcpss_full_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vrcpss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vrcpss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load <4 x float>, ptr %a %res = tail call <4 x float> @llvm.x86.sse.rcp.ss(<4 x float> %ld) @@ -153,7 +153,7 @@ define float 
@rsqrtss_size(ptr %a) optsize { ; ; AVX-LABEL: rsqrtss_size: ; AVX: # %bb.0: -; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vrsqrtss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load float, ptr %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -170,7 +170,7 @@ define <4 x float> @rsqrtss_full_size(ptr %a) optsize { ; ; AVX-LABEL: rsqrtss_full_size: ; AVX: # %bb.0: -; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vrsqrtss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load <4 x float>, ptr %a %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld) @@ -185,7 +185,7 @@ define float @rsqrtss_pgso(ptr %a) !prof !14 { ; ; AVX-LABEL: rsqrtss_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vrsqrtss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load float, ptr %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -202,7 +202,7 @@ define <4 x float> @rsqrtss_full_pgso(ptr %a) !prof !14 { ; ; AVX-LABEL: rsqrtss_full_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vrsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vrsqrtss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load <4 x float>, ptr %a %res = tail call <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float> %ld) @@ -217,7 +217,7 @@ define float @sqrtss_size(ptr %a) optsize{ ; ; AVX-LABEL: sqrtss_size: ; AVX: # %bb.0: -; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vsqrtss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load float, ptr %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -268,7 +268,7 @@ define float @sqrtss_pgso(ptr %a) !prof !14 { ; ; AVX-LABEL: sqrtss_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vsqrtss (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load float, ptr %a %ins = insertelement <4 x float> undef, float %ld, i32 0 @@ -319,7 +319,7 @@ define double @sqrtsd_size(ptr %a) optsize { ; ; AVX-LABEL: sqrtsd_size: ; AVX: # %bb.0: -; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vsqrtsd (%rdi), %xmm15, 
%xmm0 ; AVX-NEXT: retq %ld = load double, ptr %a %ins = insertelement <2 x double> undef, double %ld, i32 0 @@ -370,7 +370,7 @@ define double @sqrtsd_pgso(ptr %a) !prof !14 { ; ; AVX-LABEL: sqrtsd_pgso: ; AVX: # %bb.0: -; AVX-NEXT: vsqrtsd (%rdi), %xmm0, %xmm0 +; AVX-NEXT: vsqrtsd (%rdi), %xmm15, %xmm0 ; AVX-NEXT: retq %ld = load double, ptr %a %ins = insertelement <2 x double> undef, double %ld, i32 0 diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll index 3577f252f50da..5d69a217fb402 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -2018,7 +2018,7 @@ define double @sifdb(i8 %x) #0 { ; AVX-LABEL: sifdb: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movsbl %dil, %eax -; AVX-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call double @llvm.experimental.constrained.sitofp.f64.i8(i8 %x, @@ -2062,7 +2062,7 @@ define double @sifdw(i16 %x) #0 { ; AVX-LABEL: sifdw: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movswl %di, %eax -; AVX-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call double @llvm.experimental.constrained.sitofp.f64.i16(i16 %x, @@ -2103,7 +2103,7 @@ define double @sifdi(i32 %x) #0 { ; ; AVX-LABEL: sifdi: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call double @llvm.experimental.constrained.sitofp.f64.i32(i32 %x, @@ -2147,7 +2147,7 @@ define float @siffb(i8 %x) #0 { ; AVX-LABEL: siffb: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movsbl %dil, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call float @llvm.experimental.constrained.sitofp.f32.i8(i8 %x, @@ -2191,7 +2191,7 @@ define float @siffw(i16 %x) #0 { ; AVX-LABEL: siffw: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movswl %di, %eax -; AVX-NEXT: 
vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call float @llvm.experimental.constrained.sitofp.f32.i16(i16 %x, @@ -2232,7 +2232,7 @@ define float @siffi(i32 %x) #0 { ; ; AVX-LABEL: siffi: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call float @llvm.experimental.constrained.sitofp.f32.i32(i32 %x, @@ -2267,7 +2267,7 @@ define double @sifdl(i64 %x) #0 { ; ; AVX-LABEL: sifdl: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %rdi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call double @llvm.experimental.constrained.sitofp.f64.i64(i64 %x, @@ -2302,7 +2302,7 @@ define float @siffl(i64 %x) #0 { ; ; AVX-LABEL: siffl: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call float @llvm.experimental.constrained.sitofp.f32.i64(i64 %x, @@ -2349,7 +2349,7 @@ define double @uifdb(i8 %x) #0 { ; AVX-LABEL: uifdb: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movzbl %dil, %eax -; AVX-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call double @llvm.experimental.constrained.uitofp.f64.i8(i8 %x, @@ -2393,7 +2393,7 @@ define double @uifdw(i16 %x) #0 { ; AVX-LABEL: uifdw: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call double @llvm.experimental.constrained.uitofp.f64.i16(i16 %x, @@ -2440,12 +2440,12 @@ define double @uifdi(i32 %x) #0 { ; AVX1-LABEL: uifdi: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: uifdi: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; 
AVX512-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 %x, @@ -2518,7 +2518,7 @@ define double @uifdl(i64 %x) #0 { ; AVX1-NEXT: orq %rax, %rcx ; AVX1-NEXT: testq %rdi, %rdi ; AVX1-NEXT: cmovnsq %rdi, %rcx -; AVX1-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rcx, %xmm15, %xmm0 ; AVX1-NEXT: jns .LBB48_2 ; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 @@ -2527,7 +2527,7 @@ define double @uifdl(i64 %x) #0 { ; ; AVX512-LABEL: uifdl: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %result = call double @llvm.experimental.constrained.uitofp.f64.i64(i64 %x, @@ -2571,7 +2571,7 @@ define float @uiffb(i8 %x) #0 { ; AVX-LABEL: uiffb: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movzbl %dil, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call float @llvm.experimental.constrained.uitofp.f32.i8(i8 %x, @@ -2615,7 +2615,7 @@ define float @uiffw(i16 %x) #0 { ; AVX-LABEL: uiffw: ; AVX: # %bb.0: # %entry ; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call float @llvm.experimental.constrained.uitofp.f32.i16(i16 %x, @@ -2662,12 +2662,12 @@ define float @uiffi(i32 %x) #0 { ; AVX1-LABEL: uiffi: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: uiffi: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %result = call float @llvm.experimental.constrained.uitofp.f32.i32(i32 %x, @@ -2740,7 +2740,7 @@ define float @uiffl(i64 %x) #0 { ; AVX1-NEXT: orq %rax, %rcx ; AVX1-NEXT: testq %rdi, %rdi ; 
AVX1-NEXT: cmovnsq %rdi, %rcx -; AVX1-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rcx, %xmm15, %xmm0 ; AVX1-NEXT: jns .LBB52_2 ; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 @@ -2749,7 +2749,7 @@ define float @uiffl(i64 %x) #0 { ; ; AVX512-LABEL: uiffl: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %result = call float @llvm.experimental.constrained.uitofp.f32.i64(i64 %x, diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll index 7c0386f0e784e..c31bee5ff1030 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp-fp16.ll @@ -33,7 +33,7 @@ define half @sitofp_i1tof16(i1 %x) #0 { ; AVX-NEXT: andb $1, %dil ; AVX-NEXT: negb %dil ; AVX-NEXT: movsbl %dil, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -45,7 +45,7 @@ define half @sitofp_i1tof16(i1 %x) #0 { ; X86-NEXT: andb $1, %al ; X86-NEXT: negb %al ; X86-NEXT: movsbl %al, %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_i1tof16: @@ -53,7 +53,7 @@ define half @sitofp_i1tof16(i1 %x) #0 { ; X64-NEXT: andb $1, %dil ; X64-NEXT: negb %dil ; X64-NEXT: movsbl %dil, %eax -; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.sitofp.f16.i1(i1 %x, metadata !"round.dynamic", @@ -74,7 +74,7 @@ define half @sitofp_i8tof16(i8 %x) #0 { ; AVX-LABEL: sitofp_i8tof16: ; AVX: # %bb.0: ; AVX-NEXT: movsbl %dil, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: vxorps %xmm1, 
%xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -83,13 +83,13 @@ define half @sitofp_i8tof16(i8 %x) #0 { ; X86-LABEL: sitofp_i8tof16: ; X86: # %bb.0: ; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_i8tof16: ; X64: # %bb.0: ; X64-NEXT: movsbl %dil, %eax -; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.sitofp.f16.i8(i8 %x, metadata !"round.dynamic", @@ -110,7 +110,7 @@ define half @sitofp_i16tof16(i16 %x) #0 { ; AVX-LABEL: sitofp_i16tof16: ; AVX: # %bb.0: ; AVX-NEXT: movswl %di, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -119,13 +119,13 @@ define half @sitofp_i16tof16(i16 %x) #0 { ; X86-LABEL: sitofp_i16tof16: ; X86: # %bb.0: ; X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_i16tof16: ; X64: # %bb.0: ; X64-NEXT: movswl %di, %eax -; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.sitofp.f16.i16(i16 %x, metadata !"round.dynamic", @@ -144,7 +144,7 @@ define half @sitofp_i32tof16(i32 %x) #0 { ; ; AVX-LABEL: sitofp_i32tof16: ; AVX: # %bb.0: -; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -152,12 +152,12 @@ define half @sitofp_i32tof16(i32 %x) #0 { ; ; X86-LABEL: sitofp_i32tof16: ; X86: # %bb.0: -; X86-NEXT: vcvtsi2shl 
{{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vcvtsi2shl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_i32tof16: ; X64: # %bb.0: -; X64-NEXT: vcvtsi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %edi, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.sitofp.f16.i32(i32 %x, metadata !"round.dynamic", @@ -176,7 +176,7 @@ define half @sitofp_i64tof16(i64 %x) #0 { ; ; AVX-LABEL: sitofp_i64tof16: ; AVX: # %bb.0: -; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -190,7 +190,7 @@ define half @sitofp_i64tof16(i64 %x) #0 { ; ; X64-LABEL: sitofp_i64tof16: ; X64: # %bb.0: -; X64-NEXT: vcvtsi2sh %rdi, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %rdi, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.sitofp.f16.i64(i64 %x, metadata !"round.dynamic", @@ -211,7 +211,7 @@ define half @uitofp_i1tof16(i1 %x) #0 { ; AVX-LABEL: uitofp_i1tof16: ; AVX: # %bb.0: ; AVX-NEXT: andl $1, %edi -; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -222,13 +222,13 @@ define half @uitofp_i1tof16(i1 %x) #0 { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: andb $1, %al ; X86-NEXT: movzbl %al, %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: uitofp_i1tof16: ; X64: # %bb.0: ; X64-NEXT: andl $1, %edi -; X64-NEXT: vcvtsi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %edi, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.uitofp.f16.i1(i1 %x, metadata !"round.dynamic", @@ -249,7 +249,7 @@ define half @uitofp_i8tof16(i8 %x) #0 { ; AVX-LABEL: uitofp_i8tof16: ; AVX: # %bb.0: ; AVX-NEXT: 
movzbl %dil, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -258,13 +258,13 @@ define half @uitofp_i8tof16(i8 %x) #0 { ; X86-LABEL: uitofp_i8tof16: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: uitofp_i8tof16: ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax -; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.uitofp.f16.i8(i8 %x, metadata !"round.dynamic", @@ -285,7 +285,7 @@ define half @uitofp_i16tof16(i16 %x) #0 { ; AVX-LABEL: uitofp_i16tof16: ; AVX: # %bb.0: ; AVX-NEXT: movzwl %di, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -294,13 +294,13 @@ define half @uitofp_i16tof16(i16 %x) #0 { ; X86-LABEL: uitofp_i16tof16: ; X86: # %bb.0: ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; X86-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X86-NEXT: vcvtsi2sh %eax, %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: uitofp_i16tof16: ; X64: # %bb.0: ; X64-NEXT: movzwl %di, %eax -; X64-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; X64-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.uitofp.f16.i16(i16 %x, metadata !"round.dynamic", @@ -321,7 +321,7 @@ define half @uitofp_i32tof16(i32 %x) #0 { ; F16C-LABEL: uitofp_i32tof16: ; F16C: # %bb.0: ; F16C-NEXT: movl %edi, %eax -; F16C-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; F16C-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; F16C-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; F16C-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; F16C-NEXT: 
vcvtps2ph $4, %xmm0, %xmm0 @@ -329,7 +329,7 @@ define half @uitofp_i32tof16(i32 %x) #0 { ; ; AVX512-LABEL: uitofp_i32tof16: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -337,12 +337,12 @@ define half @uitofp_i32tof16(i32 %x) #0 { ; ; X86-LABEL: uitofp_i32tof16: ; X86: # %bb.0: -; X86-NEXT: vcvtusi2shl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vcvtusi2shl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: uitofp_i32tof16: ; X64: # %bb.0: -; X64-NEXT: vcvtusi2sh %edi, %xmm0, %xmm0 +; X64-NEXT: vcvtusi2sh %edi, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.uitofp.f16.i32(i32 %x, metadata !"round.dynamic", @@ -381,7 +381,7 @@ define half @uitofp_i64tof16(i64 %x) #0 { ; F16C-NEXT: orq %rax, %rcx ; F16C-NEXT: testq %rdi, %rdi ; F16C-NEXT: cmovnsq %rdi, %rcx -; F16C-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; F16C-NEXT: vcvtsi2ss %rcx, %xmm15, %xmm0 ; F16C-NEXT: jns .LBB9_2 ; F16C-NEXT: # %bb.1: ; F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0 @@ -393,7 +393,7 @@ define half @uitofp_i64tof16(i64 %x) #0 { ; ; AVX512-LABEL: uitofp_i64tof16: ; AVX512: # %bb.0: -; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm15, %xmm0 ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX512-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; AVX512-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -407,7 +407,7 @@ define half @uitofp_i64tof16(i64 %x) #0 { ; ; X64-LABEL: uitofp_i64tof16: ; X64: # %bb.0: -; X64-NEXT: vcvtusi2sh %rdi, %xmm0, %xmm0 +; X64-NEXT: vcvtusi2sh %rdi, %xmm31, %xmm0 ; X64-NEXT: retq %result = call half @llvm.experimental.constrained.uitofp.f16.i64(i64 %x, metadata !"round.dynamic", diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll index 
4933a870ddd87..f0aa3827ce937 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-inttofp.ll @@ -62,7 +62,7 @@ define float @sitofp_i1tof32(i1 %x) #0 { ; AVX-X86-NEXT: andb $1, %al ; AVX-X86-NEXT: negb %al ; AVX-X86-NEXT: movsbl %al, %eax -; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovss %xmm0, (%esp) ; AVX-X86-NEXT: flds (%esp) ; AVX-X86-NEXT: wait @@ -75,7 +75,7 @@ define float @sitofp_i1tof32(i1 %x) #0 { ; AVX-X64-NEXT: andb $1, %dil ; AVX-X64-NEXT: negb %dil ; AVX-X64-NEXT: movsbl %dil, %eax -; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: sitofp_i1tof32: @@ -123,7 +123,7 @@ define float @sitofp_i8tof32(i8 %x) #0 { ; AVX-X86-NEXT: pushl %eax ; AVX-X86-NEXT: .cfi_def_cfa_offset 8 ; AVX-X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax -; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovss %xmm0, (%esp) ; AVX-X86-NEXT: flds (%esp) ; AVX-X86-NEXT: wait @@ -134,7 +134,7 @@ define float @sitofp_i8tof32(i8 %x) #0 { ; AVX-X64-LABEL: sitofp_i8tof32: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: movsbl %dil, %eax -; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: sitofp_i8tof32: @@ -179,7 +179,7 @@ define float @sitofp_i16tof32(i16 %x) #0 { ; AVX-X86-NEXT: pushl %eax ; AVX-X86-NEXT: .cfi_def_cfa_offset 8 ; AVX-X86-NEXT: movswl {{[0-9]+}}(%esp), %eax -; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovss %xmm0, (%esp) ; AVX-X86-NEXT: flds (%esp) ; AVX-X86-NEXT: wait @@ -190,7 +190,7 @@ define float @sitofp_i16tof32(i16 %x) #0 { ; AVX-X64-LABEL: sitofp_i16tof32: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: movswl %di, %eax -; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm15, 
%xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: sitofp_i16tof32: @@ -232,7 +232,7 @@ define float @sitofp_i32tof32(i32 %x) #0 { ; AVX-X86: # %bb.0: ; AVX-X86-NEXT: pushl %eax ; AVX-X86-NEXT: .cfi_def_cfa_offset 8 -; AVX-X86-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; AVX-X86-NEXT: vmovss %xmm0, (%esp) ; AVX-X86-NEXT: flds (%esp) ; AVX-X86-NEXT: wait @@ -242,7 +242,7 @@ define float @sitofp_i32tof32(i32 %x) #0 { ; ; AVX-X64-LABEL: sitofp_i32tof32: ; AVX-X64: # %bb.0: -; AVX-X64-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: sitofp_i32tof32: @@ -294,7 +294,7 @@ define float @sitofp_i64tof32(i64 %x) #0 { ; ; AVX-X64-LABEL: sitofp_i64tof32: ; AVX-X64: # %bb.0: -; AVX-X64-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: sitofp_i64tof32: @@ -337,7 +337,7 @@ define float @uitofp_i1tof32(i1 %x) #0 { ; AVX-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; AVX-X86-NEXT: andb $1, %al ; AVX-X86-NEXT: movzbl %al, %eax -; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovss %xmm0, (%esp) ; AVX-X86-NEXT: flds (%esp) ; AVX-X86-NEXT: wait @@ -348,7 +348,7 @@ define float @uitofp_i1tof32(i1 %x) #0 { ; AVX-X64-LABEL: uitofp_i1tof32: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: andl $1, %edi -; AVX-X64-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: uitofp_i1tof32: @@ -395,7 +395,7 @@ define float @uitofp_i8tof32(i8 %x) #0 { ; AVX-X86-NEXT: pushl %eax ; AVX-X86-NEXT: .cfi_def_cfa_offset 8 ; AVX-X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovss %xmm0, (%esp) ; AVX-X86-NEXT: flds (%esp) ; AVX-X86-NEXT: wait @@ -406,7 +406,7 @@ define float @uitofp_i8tof32(i8 %x) #0 { ; 
AVX-X64-LABEL: uitofp_i8tof32: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: movzbl %dil, %eax -; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: uitofp_i8tof32: @@ -451,7 +451,7 @@ define float @uitofp_i16tof32(i16 %x) #0 { ; AVX-X86-NEXT: pushl %eax ; AVX-X86-NEXT: .cfi_def_cfa_offset 8 ; AVX-X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax -; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2ss %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovss %xmm0, (%esp) ; AVX-X86-NEXT: flds (%esp) ; AVX-X86-NEXT: wait @@ -462,7 +462,7 @@ define float @uitofp_i16tof32(i16 %x) #0 { ; AVX-X64-LABEL: uitofp_i16tof32: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: movzwl %di, %eax -; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: uitofp_i16tof32: @@ -534,14 +534,14 @@ define float @uitofp_i32tof32(i32 %x) #0 { ; AVX1-X64-LABEL: uitofp_i32tof32: ; AVX1-X64: # %bb.0: ; AVX1-X64-NEXT: movl %edi, %eax -; AVX1-X64-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX1-X64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX1-X64-NEXT: retq ; ; AVX512-X86-LABEL: uitofp_i32tof32: ; AVX512-X86: # %bb.0: ; AVX512-X86-NEXT: pushl %eax ; AVX512-X86-NEXT: .cfi_def_cfa_offset 8 -; AVX512-X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512-X86-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; AVX512-X86-NEXT: vmovss %xmm0, (%esp) ; AVX512-X86-NEXT: flds (%esp) ; AVX512-X86-NEXT: wait @@ -551,7 +551,7 @@ define float @uitofp_i32tof32(i32 %x) #0 { ; ; AVX512-X64-LABEL: uitofp_i32tof32: ; AVX512-X64: # %bb.0: -; AVX512-X64-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-X64-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; AVX512-X64-NEXT: retq ; ; X87-LABEL: uitofp_i32tof32: @@ -656,7 +656,7 @@ define float @uitofp_i64tof32(i64 %x) #0 { ; AVX1-X64-NEXT: orq %rax, %rcx ; AVX1-X64-NEXT: testq %rdi, %rdi ; AVX1-X64-NEXT: cmovnsq %rdi, %rcx -; AVX1-X64-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; 
AVX1-X64-NEXT: vcvtsi2ss %rcx, %xmm15, %xmm0 ; AVX1-X64-NEXT: jns .LBB9_2 ; AVX1-X64-NEXT: # %bb.1: ; AVX1-X64-NEXT: vaddss %xmm0, %xmm0, %xmm0 @@ -665,7 +665,7 @@ define float @uitofp_i64tof32(i64 %x) #0 { ; ; AVX512-X64-LABEL: uitofp_i64tof32: ; AVX512-X64: # %bb.0: -; AVX512-X64-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-X64-NEXT: vcvtusi2ss %rdi, %xmm15, %xmm0 ; AVX512-X64-NEXT: retq ; ; X87-LABEL: uitofp_i64tof32: @@ -733,7 +733,7 @@ define double @sitofp_i8tof64(i8 %x) #0 { ; AVX-X86-NEXT: andl $-8, %esp ; AVX-X86-NEXT: subl $8, %esp ; AVX-X86-NEXT: movsbl 8(%ebp), %eax -; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX-X86-NEXT: fldl (%esp) ; AVX-X86-NEXT: wait @@ -745,7 +745,7 @@ define double @sitofp_i8tof64(i8 %x) #0 { ; AVX-X64-LABEL: sitofp_i8tof64: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: movsbl %dil, %eax -; AVX-X64-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: sitofp_i8tof64: @@ -801,7 +801,7 @@ define double @sitofp_i16tof64(i16 %x) #0 { ; AVX-X86-NEXT: andl $-8, %esp ; AVX-X86-NEXT: subl $8, %esp ; AVX-X86-NEXT: movswl 8(%ebp), %eax -; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX-X86-NEXT: fldl (%esp) ; AVX-X86-NEXT: wait @@ -813,7 +813,7 @@ define double @sitofp_i16tof64(i16 %x) #0 { ; AVX-X64-LABEL: sitofp_i16tof64: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: movswl %di, %eax -; AVX-X64-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: sitofp_i16tof64: @@ -866,7 +866,7 @@ define double @sitofp_i32tof64(i32 %x) #0 { ; AVX-X86-NEXT: .cfi_def_cfa_register %ebp ; AVX-X86-NEXT: andl $-8, %esp ; AVX-X86-NEXT: subl $8, %esp -; AVX-X86-NEXT: vcvtsi2sdl 8(%ebp), %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2sdl 8(%ebp), %xmm7, %xmm0 ; AVX-X86-NEXT: vmovsd %xmm0, (%esp) 
; AVX-X86-NEXT: fldl (%esp) ; AVX-X86-NEXT: wait @@ -877,7 +877,7 @@ define double @sitofp_i32tof64(i32 %x) #0 { ; ; AVX-X64-LABEL: sitofp_i32tof64: ; AVX-X64: # %bb.0: -; AVX-X64-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: sitofp_i32tof64: @@ -941,7 +941,7 @@ define double @sitofp_i64tof64(i64 %x) #0 { ; ; AVX-X64-LABEL: sitofp_i64tof64: ; AVX-X64: # %bb.0: -; AVX-X64-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2sd %rdi, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: sitofp_i64tof64: @@ -995,7 +995,7 @@ define double @uitofp_i1tof64(i1 %x) #0 { ; AVX-X86-NEXT: movzbl 8(%ebp), %eax ; AVX-X86-NEXT: andb $1, %al ; AVX-X86-NEXT: movzbl %al, %eax -; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX-X86-NEXT: fldl (%esp) ; AVX-X86-NEXT: wait @@ -1007,7 +1007,7 @@ define double @uitofp_i1tof64(i1 %x) #0 { ; AVX-X64-LABEL: uitofp_i1tof64: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: andl $1, %edi -; AVX-X64-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: uitofp_i1tof64: @@ -1065,7 +1065,7 @@ define double @uitofp_i8tof64(i8 %x) #0 { ; AVX-X86-NEXT: andl $-8, %esp ; AVX-X86-NEXT: subl $8, %esp ; AVX-X86-NEXT: movzbl 8(%ebp), %eax -; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX-X86-NEXT: fldl (%esp) ; AVX-X86-NEXT: wait @@ -1077,7 +1077,7 @@ define double @uitofp_i8tof64(i8 %x) #0 { ; AVX-X64-LABEL: uitofp_i8tof64: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: movzbl %dil, %eax -; AVX-X64-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: uitofp_i8tof64: @@ -1133,7 +1133,7 @@ define double @uitofp_i16tof64(i16 %x) #0 { ; AVX-X86-NEXT: andl $-8, %esp ; AVX-X86-NEXT: subl $8, %esp ; AVX-X86-NEXT: movzwl 
8(%ebp), %eax -; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-X86-NEXT: vcvtsi2sd %eax, %xmm7, %xmm0 ; AVX-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX-X86-NEXT: fldl (%esp) ; AVX-X86-NEXT: wait @@ -1145,7 +1145,7 @@ define double @uitofp_i16tof64(i16 %x) #0 { ; AVX-X64-LABEL: uitofp_i16tof64: ; AVX-X64: # %bb.0: ; AVX-X64-NEXT: movzwl %di, %eax -; AVX-X64-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-X64-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-X64-NEXT: retq ; ; X87-LABEL: uitofp_i16tof64: @@ -1217,7 +1217,7 @@ define double @uitofp_i32tof64(i32 %x) #0 { ; AVX1-X64-LABEL: uitofp_i32tof64: ; AVX1-X64: # %bb.0: ; AVX1-X64-NEXT: movl %edi, %eax -; AVX1-X64-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX1-X64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-X64-NEXT: retq ; ; AVX512-X86-LABEL: uitofp_i32tof64: @@ -1229,7 +1229,7 @@ define double @uitofp_i32tof64(i32 %x) #0 { ; AVX512-X86-NEXT: .cfi_def_cfa_register %ebp ; AVX512-X86-NEXT: andl $-8, %esp ; AVX512-X86-NEXT: subl $8, %esp -; AVX512-X86-NEXT: vcvtusi2sdl 8(%ebp), %xmm0, %xmm0 +; AVX512-X86-NEXT: vcvtusi2sdl 8(%ebp), %xmm7, %xmm0 ; AVX512-X86-NEXT: vmovsd %xmm0, (%esp) ; AVX512-X86-NEXT: fldl (%esp) ; AVX512-X86-NEXT: wait @@ -1240,7 +1240,7 @@ define double @uitofp_i32tof64(i32 %x) #0 { ; ; AVX512-X64-LABEL: uitofp_i32tof64: ; AVX512-X64: # %bb.0: -; AVX512-X64-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512-X64-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 ; AVX512-X64-NEXT: retq ; ; X87-LABEL: uitofp_i32tof64: @@ -1345,7 +1345,7 @@ define double @uitofp_i64tof64(i64 %x) #0 { ; AVX1-X64-NEXT: orq %rax, %rcx ; AVX1-X64-NEXT: testq %rdi, %rdi ; AVX1-X64-NEXT: cmovnsq %rdi, %rcx -; AVX1-X64-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX1-X64-NEXT: vcvtsi2sd %rcx, %xmm15, %xmm0 ; AVX1-X64-NEXT: jns .LBB18_2 ; AVX1-X64-NEXT: # %bb.1: ; AVX1-X64-NEXT: vaddsd %xmm0, %xmm0, %xmm0 @@ -1354,7 +1354,7 @@ define double @uitofp_i64tof64(i64 %x) #0 { ; ; AVX512-X64-LABEL: uitofp_i64tof64: ; AVX512-X64: # %bb.0: -; AVX512-X64-NEXT: vcvtusi2sd %rdi, 
%xmm0, %xmm0 +; AVX512-X64-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 ; AVX512-X64-NEXT: retq ; ; X87-LABEL: uitofp_i64tof64: diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll index 1ab97dafb8514..c834ddbf46f7b 100644 --- a/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll +++ b/llvm/test/CodeGen/X86/fp-strict-scalar-round-fp16.ll @@ -37,7 +37,7 @@ define half @fceil32(half %f) #0 { ; ; X86-LABEL: fceil32: ; X86: # %bb.0: -; X86-NEXT: vrndscalesh $10, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vrndscalesh $10, {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: fceil32: @@ -73,7 +73,7 @@ define half @ffloor32(half %f) #0 { ; ; X86-LABEL: ffloor32: ; X86: # %bb.0: -; X86-NEXT: vrndscalesh $9, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vrndscalesh $9, {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: ffloor32: @@ -109,7 +109,7 @@ define half @ftrunc32(half %f) #0 { ; ; X86-LABEL: ftrunc32: ; X86: # %bb.0: -; X86-NEXT: vrndscalesh $11, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vrndscalesh $11, {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: ftrunc32: @@ -145,7 +145,7 @@ define half @frint32(half %f) #0 { ; ; X86-LABEL: frint32: ; X86: # %bb.0: -; X86-NEXT: vrndscalesh $4, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vrndscalesh $4, {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: frint32: @@ -182,7 +182,7 @@ define half @fnearbyint32(half %f) #0 { ; ; X86-LABEL: fnearbyint32: ; X86: # %bb.0: -; X86-NEXT: vrndscalesh $12, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vrndscalesh $12, {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: fnearbyint32: @@ -219,7 +219,7 @@ define half @froundeven16(half %f) #0 { ; ; X86-LABEL: froundeven16: ; X86: # %bb.0: -; X86-NEXT: vrndscalesh $8, {{[0-9]+}}(%esp), %xmm0, %xmm0 +; X86-NEXT: vrndscalesh $8, {{[0-9]+}}(%esp), %xmm7, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: froundeven16: diff --git 
a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll index da44b5ec1371e..3ed98589767fb 100644 --- a/llvm/test/CodeGen/X86/ftrunc.ll +++ b/llvm/test/CodeGen/X86/ftrunc.ll @@ -514,7 +514,7 @@ define double @trunc_signed_f64_no_fast_math(double %x) { ; X64-AVX1-LABEL: trunc_signed_f64_no_fast_math: ; X64-AVX1: # %bb.0: ; X64-AVX1-NEXT: vcvttsd2si %xmm0, %rax -; X64-AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 +; X64-AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; X64-AVX1-NEXT: retq ; ; X86-AVX1-LABEL: trunc_signed_f64_no_fast_math: @@ -695,7 +695,7 @@ define float @trunc_unsigned_f32_disable_via_intrinsic(float %x) #0 { ; X64-AVX1-NEXT: vucomiss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-AVX1-NEXT: movl $-1, %eax ; X64-AVX1-NEXT: cmovbel %ecx, %eax -; X64-AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; X64-AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; X64-AVX1-NEXT: retq ; ; X86-AVX1-LABEL: trunc_unsigned_f32_disable_via_intrinsic: @@ -752,7 +752,7 @@ define double @trunc_signed_f64_disable_via_intrinsic(double %x) #0 { ; X64-AVX1-NEXT: xorl %eax, %eax ; X64-AVX1-NEXT: vucomisd %xmm0, %xmm0 ; X64-AVX1-NEXT: cmovnpq %rcx, %rax -; X64-AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 +; X64-AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; X64-AVX1-NEXT: retq ; ; X86-AVX1-LABEL: trunc_signed_f64_disable_via_intrinsic: diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index d8686b8b2950f..b6a4a12eb0fac 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -262,7 +262,7 @@ define void @test_sitofp_i64(i64 %a, ptr %p) #0 { ; ; BWON-F16C-LABEL: test_sitofp_i64: ; BWON-F16C: # %bb.0: -; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vpextrw $0, %xmm0, (%rsi) ; BWON-F16C-NEXT: retq @@ -385,14 +385,14 @@ define void @test_uitofp_i64(i64 %a, ptr %p) #0 { ; BWON-F16C-NEXT: testq %rdi, %rdi ; BWON-F16C-NEXT: js .LBB10_1 ; 
BWON-F16C-NEXT: # %bb.2: -; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; BWON-F16C-NEXT: jmp .LBB10_3 ; BWON-F16C-NEXT: .LBB10_1: ; BWON-F16C-NEXT: movq %rdi, %rax ; BWON-F16C-NEXT: shrq %rax ; BWON-F16C-NEXT: andl $1, %edi ; BWON-F16C-NEXT: orq %rax, %rdi -; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; BWON-F16C-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; BWON-F16C-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; BWON-F16C-NEXT: .LBB10_3: ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 @@ -843,7 +843,7 @@ define float @test_sitofp_fadd_i32(i32 %a, ptr %b) #0 { ; BWON-F16C-LABEL: test_sitofp_fadd_i32: ; BWON-F16C: # %bb.0: ; BWON-F16C-NEXT: vpinsrw $0, (%rsi), %xmm0, %xmm0 -; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm1, %xmm1 +; BWON-F16C-NEXT: vcvtsi2ss %edi, %xmm15, %xmm1 ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 ; BWON-F16C-NEXT: vcvtph2ps %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/isel-int-to-fp.ll b/llvm/test/CodeGen/X86/isel-int-to-fp.ll index fc99ff95788f3..5884944e41986 100644 --- a/llvm/test/CodeGen/X86/isel-int-to-fp.ll +++ b/llvm/test/CodeGen/X86/isel-int-to-fp.ll @@ -33,7 +33,7 @@ define double @test_ui64_to_double(i64 %x) { ; ; AVX512-LABEL: test_ui64_to_double: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = uitofp i64 %x to double @@ -49,7 +49,7 @@ define double @test_ui32_to_double(i32 %x) { ; ; AVX512-LABEL: test_ui32_to_double: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = uitofp i32 %x to double @@ -64,12 +64,12 @@ define double @test_ui16_to_double(i16 zeroext %x) { ; ; SDAG-AVX512-LABEL: test_ui16_to_double: ; SDAG-AVX512: # %bb.0: # %entry -; SDAG-AVX512-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; SDAG-AVX512-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; 
SDAG-AVX512-NEXT: retq ; ; GISEL-AVX512-LABEL: test_ui16_to_double: ; GISEL-AVX512: # %bb.0: # %entry -; GISEL-AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; GISEL-AVX512-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 ; GISEL-AVX512-NEXT: retq entry: %conv = uitofp i16 %x to double @@ -84,12 +84,12 @@ define double @test_ui8_to_double(i8 zeroext %x) { ; ; SDAG-AVX512-LABEL: test_ui8_to_double: ; SDAG-AVX512: # %bb.0: # %entry -; SDAG-AVX512-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; SDAG-AVX512-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; SDAG-AVX512-NEXT: retq ; ; GISEL-AVX512-LABEL: test_ui8_to_double: ; GISEL-AVX512: # %bb.0: # %entry -; GISEL-AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; GISEL-AVX512-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 ; GISEL-AVX512-NEXT: retq entry: %conv = uitofp i8 %x to double @@ -135,7 +135,7 @@ define float @test_ui64_to_float(i64 %x) { ; ; AVX512-LABEL: test_ui64_to_float: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = uitofp i64 %x to float @@ -151,7 +151,7 @@ define float @test_ui32_to_float(i32 %x) { ; ; AVX512-LABEL: test_ui32_to_float: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = uitofp i32 %x to float @@ -166,12 +166,12 @@ define float @test_ui16_to_float(i16 zeroext %x) { ; ; SDAG-AVX512-LABEL: test_ui16_to_float: ; SDAG-AVX512: # %bb.0: # %entry -; SDAG-AVX512-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; SDAG-AVX512-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; SDAG-AVX512-NEXT: retq ; ; GISEL-AVX512-LABEL: test_ui16_to_float: ; GISEL-AVX512: # %bb.0: # %entry -; GISEL-AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; GISEL-AVX512-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; GISEL-AVX512-NEXT: retq entry: %conv = uitofp i16 %x to float @@ -186,12 +186,12 @@ define float @test_ui8_to_float(i8 zeroext %x) { ; ; SDAG-AVX512-LABEL: test_ui8_to_float: ; 
SDAG-AVX512: # %bb.0: # %entry -; SDAG-AVX512-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; SDAG-AVX512-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; SDAG-AVX512-NEXT: retq ; ; GISEL-AVX512-LABEL: test_ui8_to_float: ; GISEL-AVX512: # %bb.0: # %entry -; GISEL-AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; GISEL-AVX512-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; GISEL-AVX512-NEXT: retq entry: %conv = uitofp i8 %x to float @@ -206,7 +206,7 @@ define double @test_si64_to_double(i64 %x) { ; ; AVX512-LABEL: test_si64_to_double: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2sd %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i64 %x to double @@ -221,7 +221,7 @@ define double @test_si32_to_double(i32 %x) { ; ; AVX512-LABEL: test_si32_to_double: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i32 %x to double @@ -236,7 +236,7 @@ define double @test_si16_to_double(i16 signext %x) { ; ; AVX512-LABEL: test_si16_to_double: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i16 %x to double @@ -251,7 +251,7 @@ define double @test_si8_to_double(i8 signext %x) { ; ; AVX512-LABEL: test_si8_to_double: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i8 %x to double @@ -270,7 +270,7 @@ define double @test_si31_to_double(i31 %x) { ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: addl %edi, %edi ; AVX512-NEXT: sarl %edi -; AVX512-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i31 %x to double @@ -289,7 +289,7 @@ define double @test_si33_to_double(i33 %x) { ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: shlq $31, %rdi ; AVX512-NEXT: sarq $31, %rdi -; 
AVX512-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2sd %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i33 %x to double @@ -304,7 +304,7 @@ define float @test_si64_to_float(i64 %x) { ; ; AVX512-LABEL: test_si64_to_float: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i64 %x to float @@ -319,7 +319,7 @@ define float @test_si32_to_float(i32 %x) { ; ; AVX512-LABEL: test_si32_to_float: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i32 %x to float @@ -334,7 +334,7 @@ define float @test_si16_to_float(i16 signext %x) { ; ; AVX512-LABEL: test_si16_to_float: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i16 %x to float @@ -349,7 +349,7 @@ define float @test_si8_to_float(i8 signext %x) { ; ; AVX512-LABEL: test_si8_to_float: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i8 %x to float @@ -368,7 +368,7 @@ define float @test_si31_to_float(i31 %x) { ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: addl %edi, %edi ; AVX512-NEXT: sarl %edi -; AVX512-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i31 %x to float @@ -387,7 +387,7 @@ define float @test_si33_to_float(i33 %x) { ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: shlq $31, %rdi ; AVX512-NEXT: sarq $31, %rdi -; AVX512-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %conv = sitofp i33 %x to float diff --git a/llvm/test/CodeGen/X86/pr34080.ll b/llvm/test/CodeGen/X86/pr34080.ll index 436b54db333b3..d07d1aaf6fc0a 100644 --- 
a/llvm/test/CodeGen/X86/pr34080.ll +++ b/llvm/test/CodeGen/X86/pr34080.ll @@ -124,7 +124,7 @@ define void @_Z1fe(x86_fp80 %z) local_unnamed_addr #0 { ; AVX-NEXT: fldt 16(%rbp) ; AVX-NEXT: fld %st(0) ; AVX-NEXT: fisttpl -4(%rbp) -; AVX-NEXT: vcvtsi2sdl -4(%rbp), %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sdl -4(%rbp), %xmm15, %xmm0 ; AVX-NEXT: vmovsd %xmm0, -48(%rbp) ; AVX-NEXT: vmovsd %xmm0, -24(%rbp) ; AVX-NEXT: fsubl -24(%rbp) @@ -132,7 +132,7 @@ define void @_Z1fe(x86_fp80 %z) local_unnamed_addr #0 { ; AVX-NEXT: fmul %st, %st(1) ; AVX-NEXT: fld %st(1) ; AVX-NEXT: fisttpl -8(%rbp) -; AVX-NEXT: vcvtsi2sdl -8(%rbp), %xmm1, %xmm0 +; AVX-NEXT: vcvtsi2sdl -8(%rbp), %xmm15, %xmm0 ; AVX-NEXT: vmovsd %xmm0, -40(%rbp) ; AVX-NEXT: vmovsd %xmm0, -16(%rbp) ; AVX-NEXT: fxch %st(1) diff --git a/llvm/test/CodeGen/X86/pr37879.ll b/llvm/test/CodeGen/X86/pr37879.ll index 60ca7c5b6d22b..34cbccca2867b 100644 --- a/llvm/test/CodeGen/X86/pr37879.ll +++ b/llvm/test/CodeGen/X86/pr37879.ll @@ -5,7 +5,7 @@ define double @foo(ptr nocapture readonly) #0 { ; CHECK-LABEL: foo: ; CHECK: ## %bb.0: ; CHECK-NEXT: movq (%rax), %rax -; CHECK-NEXT: vcvtsi2sd %rax, %xmm0, %xmm1 +; CHECK-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} diff --git a/llvm/test/CodeGen/X86/pr38803.ll b/llvm/test/CodeGen/X86/pr38803.ll index ebac8121df913..3efe9f8dfa55d 100644 --- a/llvm/test/CodeGen/X86/pr38803.ll +++ b/llvm/test/CodeGen/X86/pr38803.ll @@ -17,7 +17,7 @@ define dso_local float @_Z3fn2v() { ; CHECK-NEXT: cmpl $0, c(%rip) ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.then -; CHECK-NEXT: vcvtsi2ssl b(%rip), %xmm1, %xmm1 +; CHECK-NEXT: vcvtsi2ssl b(%rip), %xmm15, %xmm1 ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1} diff --git a/llvm/test/CodeGen/X86/rounding-ops.ll b/llvm/test/CodeGen/X86/rounding-ops.ll index 
04d6d1fdb1418..948449c68b3f0 100644 --- a/llvm/test/CodeGen/X86/rounding-ops.ll +++ b/llvm/test/CodeGen/X86/rounding-ops.ll @@ -221,12 +221,12 @@ define float @test11(ptr %xptr) nounwind optsize { ; ; CHECK-AVX-LABEL: test11: ; CHECK-AVX: ## %bb.0: -; CHECK-AVX-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vroundss $11, (%rdi), %xmm15, %xmm0 ; CHECK-AVX-NEXT: retq ; ; CHECK-AVX512-LABEL: test11: ; CHECK-AVX512: ## %bb.0: -; CHECK-AVX512-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vroundss $11, (%rdi), %xmm15, %xmm0 ; CHECK-AVX512-NEXT: retq %x = load float, ptr %xptr %call = tail call float @truncf(float %x) nounwind readnone @@ -241,12 +241,12 @@ define double @test12(ptr %xptr) nounwind optsize { ; ; CHECK-AVX-LABEL: test12: ; CHECK-AVX: ## %bb.0: -; CHECK-AVX-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vroundsd $11, (%rdi), %xmm15, %xmm0 ; CHECK-AVX-NEXT: retq ; ; CHECK-AVX512-LABEL: test12: ; CHECK-AVX512: ## %bb.0: -; CHECK-AVX512-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vroundsd $11, (%rdi), %xmm15, %xmm0 ; CHECK-AVX512-NEXT: retq %x = load double, ptr %xptr %call = tail call double @trunc(double %x) nounwind readnone @@ -261,12 +261,12 @@ define float @test11_pgso(ptr %xptr) nounwind !prof !14 { ; ; CHECK-AVX-LABEL: test11_pgso: ; CHECK-AVX: ## %bb.0: -; CHECK-AVX-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vroundss $11, (%rdi), %xmm15, %xmm0 ; CHECK-AVX-NEXT: retq ; ; CHECK-AVX512-LABEL: test11_pgso: ; CHECK-AVX512: ## %bb.0: -; CHECK-AVX512-NEXT: vroundss $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vroundss $11, (%rdi), %xmm15, %xmm0 ; CHECK-AVX512-NEXT: retq %x = load float, ptr %xptr %call = tail call float @truncf(float %x) nounwind readnone @@ -281,12 +281,12 @@ define double @test12_pgso(ptr %xptr) nounwind !prof !14 { ; ; CHECK-AVX-LABEL: test12_pgso: ; CHECK-AVX: ## %bb.0: -; CHECK-AVX-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX-NEXT: vroundsd $11, 
(%rdi), %xmm15, %xmm0 ; CHECK-AVX-NEXT: retq ; ; CHECK-AVX512-LABEL: test12_pgso: ; CHECK-AVX512: ## %bb.0: -; CHECK-AVX512-NEXT: vroundsd $11, (%rdi), %xmm0, %xmm0 +; CHECK-AVX512-NEXT: vroundsd $11, (%rdi), %xmm15, %xmm0 ; CHECK-AVX512-NEXT: retq %x = load double, ptr %xptr %call = tail call double @trunc(double %x) nounwind readnone diff --git a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll index 0757d30296e24..43c1a84f7cd6c 100644 --- a/llvm/test/CodeGen/X86/scalar-int-to-fp.ll +++ b/llvm/test/CodeGen/X86/scalar-int-to-fp.ll @@ -18,7 +18,7 @@ define float @u32_to_f(i32 %a) nounwind { ; AVX512_32-LABEL: u32_to_f: ; AVX512_32: # %bb.0: ; AVX512_32-NEXT: pushl %eax -; AVX512_32-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512_32-NEXT: vcvtusi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; AVX512_32-NEXT: vmovss %xmm0, (%esp) ; AVX512_32-NEXT: flds (%esp) ; AVX512_32-NEXT: popl %eax @@ -26,7 +26,7 @@ define float @u32_to_f(i32 %a) nounwind { ; ; AVX512_64-LABEL: u32_to_f: ; AVX512_64: # %bb.0: -; AVX512_64-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; SSE2_32-LABEL: u32_to_f: @@ -84,7 +84,7 @@ define float @s32_to_f(i32 %a) nounwind { ; AVX512_32-LABEL: s32_to_f: ; AVX512_32: # %bb.0: ; AVX512_32-NEXT: pushl %eax -; AVX512_32-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm0, %xmm0 +; AVX512_32-NEXT: vcvtsi2ssl {{[0-9]+}}(%esp), %xmm7, %xmm0 ; AVX512_32-NEXT: vmovss %xmm0, (%esp) ; AVX512_32-NEXT: flds (%esp) ; AVX512_32-NEXT: popl %eax @@ -92,7 +92,7 @@ define float @s32_to_f(i32 %a) nounwind { ; ; AVX512_64-LABEL: s32_to_f: ; AVX512_64: # %bb.0: -; AVX512_64-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; SSE_32-LABEL: s32_to_f: @@ -128,7 +128,7 @@ define double @u32_to_d(i32 %a) nounwind { ; AVX512_32-NEXT: movl %esp, %ebp ; AVX512_32-NEXT: andl $-8, %esp ; AVX512_32-NEXT: subl $8, %esp -; 
AVX512_32-NEXT: vcvtusi2sdl 8(%ebp), %xmm0, %xmm0 +; AVX512_32-NEXT: vcvtusi2sdl 8(%ebp), %xmm7, %xmm0 ; AVX512_32-NEXT: vmovsd %xmm0, (%esp) ; AVX512_32-NEXT: fldl (%esp) ; AVX512_32-NEXT: movl %ebp, %esp @@ -137,7 +137,7 @@ define double @u32_to_d(i32 %a) nounwind { ; ; AVX512_64-LABEL: u32_to_d: ; AVX512_64: # %bb.0: -; AVX512_64-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; SSE2_32-LABEL: u32_to_d: @@ -199,7 +199,7 @@ define double @s32_to_d(i32 %a) nounwind { ; AVX512_32-NEXT: movl %esp, %ebp ; AVX512_32-NEXT: andl $-8, %esp ; AVX512_32-NEXT: subl $8, %esp -; AVX512_32-NEXT: vcvtsi2sdl 8(%ebp), %xmm0, %xmm0 +; AVX512_32-NEXT: vcvtsi2sdl 8(%ebp), %xmm7, %xmm0 ; AVX512_32-NEXT: vmovsd %xmm0, (%esp) ; AVX512_32-NEXT: fldl (%esp) ; AVX512_32-NEXT: movl %ebp, %esp @@ -208,7 +208,7 @@ define double @s32_to_d(i32 %a) nounwind { ; ; AVX512_64-LABEL: s32_to_d: ; AVX512_64: # %bb.0: -; AVX512_64-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; SSE2_32-LABEL: s32_to_d: @@ -308,7 +308,7 @@ define float @u64_to_f(i64 %a) nounwind { ; ; AVX512_64-LABEL: u64_to_f: ; AVX512_64: # %bb.0: -; AVX512_64-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtusi2ss %rdi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; AVX512DQ_32-LABEL: u64_to_f: @@ -437,7 +437,7 @@ define float @s64_to_f(i64 %a) nounwind { ; ; AVX512_64-LABEL: s64_to_f: ; AVX512_64: # %bb.0: -; AVX512_64-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; AVX512DQ_32-LABEL: s64_to_f: @@ -502,7 +502,7 @@ define float @s64_to_f_2(i64 %a) nounwind { ; AVX512_64-LABEL: s64_to_f_2: ; AVX512_64: # %bb.0: ; AVX512_64-NEXT: addq $5, %rdi -; AVX512_64-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; AVX512DQ_32-LABEL: s64_to_f_2: @@ -626,7 +626,7 @@ define double @u64_to_d(i64 
%a) nounwind { ; ; AVX512_64-LABEL: u64_to_d: ; AVX512_64: # %bb.0: -; AVX512_64-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; AVX512DQ_32-LABEL: u64_to_d: @@ -748,7 +748,7 @@ define double @u64_to_d_optsize(i64 %a) nounwind optsize { ; ; AVX512_64-LABEL: u64_to_d_optsize: ; AVX512_64: # %bb.0: -; AVX512_64-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; AVX512DQ_32-LABEL: u64_to_d_optsize: @@ -869,7 +869,7 @@ define double @s64_to_d(i64 %a) nounwind { ; ; AVX512_64-LABEL: s64_to_d: ; AVX512_64: # %bb.0: -; AVX512_64-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtsi2sd %rdi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; AVX512DQ_32-LABEL: s64_to_d: @@ -955,7 +955,7 @@ define double @s64_to_d_2(i64 %a) nounwind { ; AVX512_64-LABEL: s64_to_d_2: ; AVX512_64: # %bb.0: ; AVX512_64-NEXT: addq $5, %rdi -; AVX512_64-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX512_64-NEXT: vcvtsi2sd %rdi, %xmm15, %xmm0 ; AVX512_64-NEXT: retq ; ; AVX512DQ_32-LABEL: s64_to_d_2: diff --git a/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll index 28b405799dfd0..b64bfb38c7a5a 100644 --- a/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll +++ b/llvm/test/CodeGen/X86/select-narrow-int-to-fp.ll @@ -17,13 +17,13 @@ define float @uint8ToFloat(i8 %int8) { ; CHECK-NO_FP16-LABEL: uint8ToFloat: ; CHECK-NO_FP16: # %bb.0: ; CHECK-NO_FP16-NEXT: movzbl %dil, %eax -; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-NO_FP16-NEXT: retq ; ; CHECK-WITH_FP16-LABEL: uint8ToFloat: ; CHECK-WITH_FP16: # %bb.0: ; CHECK-WITH_FP16-NEXT: movzbl %dil, %eax -; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-WITH_FP16-NEXT: retq %fp32 = uitofp i8 %int8 to float ret float %fp32 @@ -62,14 +62,14 @@ define half 
@uint8ToHalf(i8 %int8) { ; CHECK-NO_FP16-LABEL: uint8ToHalf: ; CHECK-NO_FP16: # %bb.0: ; CHECK-NO_FP16-NEXT: movzbl %dil, %eax -; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO_FP16-NEXT: retq ; ; CHECK-WITH_FP16-LABEL: uint8ToHalf: ; CHECK-WITH_FP16: # %bb.0: ; CHECK-WITH_FP16-NEXT: movzbl %dil, %eax -; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; CHECK-WITH_FP16-NEXT: retq %fp32 = uitofp i8 %int8 to half ret half %fp32 @@ -111,13 +111,13 @@ define float @sint8ToFloat(i8 %int8) { ; CHECK-NO_FP16-LABEL: sint8ToFloat: ; CHECK-NO_FP16: # %bb.0: ; CHECK-NO_FP16-NEXT: movsbl %dil, %eax -; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-NO_FP16-NEXT: retq ; ; CHECK-WITH_FP16-LABEL: sint8ToFloat: ; CHECK-WITH_FP16: # %bb.0: ; CHECK-WITH_FP16-NEXT: movsbl %dil, %eax -; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-WITH_FP16-NEXT: retq %fp32 = sitofp i8 %int8 to float ret float %fp32 @@ -143,14 +143,14 @@ define half @sint8ToHalf(i8 %int8) { ; CHECK-NO_FP16-LABEL: sint8ToHalf: ; CHECK-NO_FP16: # %bb.0: ; CHECK-NO_FP16-NEXT: movsbl %dil, %eax -; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO_FP16-NEXT: retq ; ; CHECK-WITH_FP16-LABEL: sint8ToHalf: ; CHECK-WITH_FP16: # %bb.0: ; CHECK-WITH_FP16-NEXT: movsbl %dil, %eax -; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; CHECK-WITH_FP16-NEXT: retq %fp32 = sitofp i8 %int8 to half ret half %fp32 @@ -184,13 +184,13 @@ define float @uint16ToFloat(i16 %int16) { ; CHECK-NO_FP16-LABEL: uint16ToFloat: ; CHECK-NO_FP16: # %bb.0: ; CHECK-NO_FP16-NEXT: movzwl %di, 
%eax -; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-NO_FP16-NEXT: retq ; ; CHECK-WITH_FP16-LABEL: uint16ToFloat: ; CHECK-WITH_FP16: # %bb.0: ; CHECK-WITH_FP16-NEXT: movzwl %di, %eax -; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-WITH_FP16-NEXT: retq %fp32 = uitofp i16 %int16 to float ret float %fp32 @@ -216,14 +216,14 @@ define half @uint16ToHalf(i16 %int16) { ; CHECK-NO_FP16-LABEL: uint16ToHalf: ; CHECK-NO_FP16: # %bb.0: ; CHECK-NO_FP16-NEXT: movzwl %di, %eax -; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-NO_FP16-NEXT: retq ; ; CHECK-WITH_FP16-LABEL: uint16ToHalf: ; CHECK-WITH_FP16: # %bb.0: ; CHECK-WITH_FP16-NEXT: movzwl %di, %eax -; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; CHECK-WITH_FP16-NEXT: retq %fp32 = uitofp i16 %int16 to half ret half %fp32 @@ -249,13 +249,13 @@ define float @sint16ToFloat(i16 %int16) { ; CHECK-NO_FP16-LABEL: sint16ToFloat: ; CHECK-NO_FP16: # %bb.0: ; CHECK-NO_FP16-NEXT: movswl %di, %eax -; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-NO_FP16-NEXT: retq ; ; CHECK-WITH_FP16-LABEL: sint16ToFloat: ; CHECK-WITH_FP16: # %bb.0: ; CHECK-WITH_FP16-NEXT: movswl %di, %eax -; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-WITH_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-WITH_FP16-NEXT: retq %fp32 = sitofp i16 %int16 to float ret float %fp32 @@ -281,14 +281,14 @@ define half @sint16ToHalf(i16 %int16) { ; CHECK-NO_FP16-LABEL: sint16ToHalf: ; CHECK-NO_FP16: # %bb.0: ; CHECK-NO_FP16-NEXT: movswl %di, %eax -; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; CHECK-NO_FP16-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; CHECK-NO_FP16-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; 
CHECK-NO_FP16-NEXT: retq ; ; CHECK-WITH_FP16-LABEL: sint16ToHalf: ; CHECK-WITH_FP16: # %bb.0: ; CHECK-WITH_FP16-NEXT: movswl %di, %eax -; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm0, %xmm0 +; CHECK-WITH_FP16-NEXT: vcvtsi2sh %eax, %xmm31, %xmm0 ; CHECK-WITH_FP16-NEXT: retq %fp32 = sitofp i16 %int16 to half ret half %fp32 diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll index 8a6c2f851a6d6..c8e31f7088a45 100644 --- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll +++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll @@ -1,21 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 define <4 x i64> @autogen_SD88863() { -; CHECK-LABEL: autogen_SD88863: -; CHECK: # %bb.0: # %BB -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_1: # %CF -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_1 -; CHECK-NEXT: # %bb.2: # %CF240 -; CHECK-NEXT: ret{{[l|q]}} +; X86-LABEL: autogen_SD88863: +; X86: # %bb.0: # %BB +; X86-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm7[0,1] +; X86-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X86-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] +; X86-NEXT: movb $1, %al +; X86-NEXT: .p2align 4 +; X86-NEXT: .LBB0_1: # %CF +; X86-NEXT: # =>This Inner Loop Header: Depth=1 +; X86-NEXT: testb %al, %al +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # %bb.2: # %CF240 +; X86-NEXT: retl +; +; X64-LABEL: 
autogen_SD88863: +; X64: # %bb.0: # %BB +; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm15[0,1] +; X64-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X64-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] +; X64-NEXT: movb $1, %al +; X64-NEXT: .p2align 4 +; X64-NEXT: .LBB0_1: # %CF +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: testb %al, %al +; X64-NEXT: jne .LBB0_1 +; X64-NEXT: # %bb.2: # %CF240 +; X64-NEXT: retq BB: %I26 = insertelement <4 x i64> undef, i64 undef, i32 2 br label %CF diff --git a/llvm/test/CodeGen/X86/sse-cvttp2si.ll b/llvm/test/CodeGen/X86/sse-cvttp2si.ll index d08cf120bb4b6..09b1d0f8b87db 100644 --- a/llvm/test/CodeGen/X86/sse-cvttp2si.ll +++ b/llvm/test/CodeGen/X86/sse-cvttp2si.ll @@ -23,7 +23,7 @@ define float @float_to_int_to_float_mem_f32_i32(ptr %p) #0 { ; AVX-LABEL: float_to_int_to_float_mem_f32_i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttss2si (%rdi), %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: retq %x = load <4 x float>, ptr %p, align 16 %fptosi = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %x) @@ -42,7 +42,7 @@ define float @float_to_int_to_float_reg_f32_i32(<4 x float> %x) #0 { ; AVX-LABEL: float_to_int_to_float_reg_f32_i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttss2si %xmm0, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: retq %fptosi = tail call i32 @llvm.x86.sse.cvttss2si(<4 x float> %x) %sitofp = sitofp i32 %fptosi to float @@ -59,7 +59,7 @@ define float @float_to_int_to_float_mem_f32_i64(ptr %p) #0 { ; AVX-LABEL: float_to_int_to_float_mem_f32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vcvttss2si (%rdi), %rax -; AVX-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX-NEXT: retq %x = load <4 x float>, ptr %p, align 16 %fptosi = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %x) @@ -78,7 +78,7 @@ define float @float_to_int_to_float_reg_f32_i64(<4 x float> %x) #0 { ; AVX-LABEL: 
float_to_int_to_float_reg_f32_i64: ; AVX: # %bb.0: ; AVX-NEXT: vcvttss2si %xmm0, %rax -; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0 +; AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX-NEXT: retq %fptosi = tail call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %x) %sitofp = sitofp i64 %fptosi to float @@ -95,7 +95,7 @@ define double @float_to_int_to_float_mem_f64_i32(ptr %p) #0 { ; AVX-LABEL: float_to_int_to_float_mem_f64_i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttsd2si (%rdi), %eax -; AVX-NEXT: vcvtsi2sd %eax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-NEXT: retq %x = load <2 x double>, ptr %p, align 16 %fptosi = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %x) @@ -114,7 +114,7 @@ define double @float_to_int_to_float_reg_f64_i32(<2 x double> %x) #0 { ; AVX-LABEL: float_to_int_to_float_reg_f64_i32: ; AVX: # %bb.0: ; AVX-NEXT: vcvttsd2si %xmm0, %eax -; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm0 +; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-NEXT: retq %fptosi = tail call i32 @llvm.x86.sse2.cvttsd2si(<2 x double> %x) %sitofp = sitofp i32 %fptosi to double @@ -131,7 +131,7 @@ define double @float_to_int_to_float_mem_f64_i64(ptr %p) #0 { ; AVX-LABEL: float_to_int_to_float_mem_f64_i64: ; AVX: # %bb.0: ; AVX-NEXT: vcvttsd2si (%rdi), %rax -; AVX-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX-NEXT: retq %x = load <2 x double>, ptr %p, align 16 %fptosi = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %x) @@ -150,7 +150,7 @@ define double @float_to_int_to_float_reg_f64_i64(<2 x double> %x) #0 { ; AVX-LABEL: float_to_int_to_float_reg_f64_i64: ; AVX: # %bb.0: ; AVX-NEXT: vcvttsd2si %xmm0, %rax -; AVX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 +; AVX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX-NEXT: retq %fptosi = tail call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %x) %sitofp = sitofp i64 %fptosi to double diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll index 
f6b0df153c260..6dd75c8c09ce5 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -782,7 +782,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load_optsize(<2 x double> %a0, ptr % ; X86-AVX1-LABEL: test_x86_sse2_cvtss2sd_load_optsize: ; X86-AVX1: ## %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vcvtss2sd (%eax), %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0x08] +; X86-AVX1-NEXT: vcvtss2sd (%eax), %xmm7, %xmm1 ## encoding: [0xc5,0xc2,0x5a,0x08] ; X86-AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1] ; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1] ; X86-AVX1-NEXT: retl ## encoding: [0xc3] @@ -790,7 +790,7 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load_optsize(<2 x double> %a0, ptr % ; X86-AVX512-LABEL: test_x86_sse2_cvtss2sd_load_optsize: ; X86-AVX512: ## %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vcvtss2sd (%eax), %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0x08] +; X86-AVX512-NEXT: vcvtss2sd (%eax), %xmm7, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xc2,0x5a,0x08] ; X86-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] ; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1] ; X86-AVX512-NEXT: retl ## encoding: [0xc3] @@ -804,14 +804,14 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load_optsize(<2 x double> %a0, ptr % ; ; X64-AVX1-LABEL: test_x86_sse2_cvtss2sd_load_optsize: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vcvtss2sd (%rdi), %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0x0f] +; X64-AVX1-NEXT: vcvtss2sd (%rdi), %xmm15, %xmm1 ## encoding: [0xc5,0x82,0x5a,0x0f] ; X64-AVX1-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfb,0x10,0xc1] ; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1] ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: test_x86_sse2_cvtss2sd_load_optsize: ; 
X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vcvtss2sd (%rdi), %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0x0f] +; X64-AVX512-NEXT: vcvtss2sd (%rdi), %xmm15, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0x82,0x5a,0x0f] ; X64-AVX512-NEXT: vmovsd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0xc1] ; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1] ; X64-AVX512-NEXT: retq ## encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll index 6625cc4f07a27..d7404c9e7c7da 100644 --- a/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll +++ b/llvm/test/CodeGen/X86/stack-folding-fp-avx1.ll @@ -990,7 +990,7 @@ define double @stack_fold_cvtsi2sd(i32 %a0) { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -1034,7 +1034,7 @@ define <2 x double> @stack_fold_cvtsi2sd_int(i32 %a0) { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: vcvtsi2sdl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -1080,7 +1080,7 @@ define double @stack_fold_cvtsi642sd(i64 %a0) { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -1124,7 +1124,7 @@ define <2 x double> @stack_fold_cvtsi642sd_int(i64 %a0) { ; CHECK-NEXT: #APP ; 
CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vcvtsi2sdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 @@ -1170,7 +1170,7 @@ define float @stack_fold_cvtsi2ss(i32 %a0) { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -1214,7 +1214,7 @@ define <4 x float> @stack_fold_cvtsi2ss_int(i32 %a0) { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: vcvtsi2ssl {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: popq %rbx @@ -1261,7 +1261,7 @@ define float @stack_fold_cvtsi642ss(i64 %a0) { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: popq %rbx ; CHECK-NEXT: .cfi_def_cfa_offset 48 ; CHECK-NEXT: popq %r12 @@ -1305,7 +1305,7 @@ define <4 x float> @stack_fold_cvtsi642ss_int(i64 %a0) { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vcvtsi2ssq {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; CHECK-NEXT: popq %rbx @@ -2861,8 +2861,8 
@@ define double @stack_fold_roundsd(double %a0) optsize { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vxorps %xmm15, %xmm15, %xmm15 +; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call double @llvm.floor.f64(double %a0) @@ -2876,7 +2876,7 @@ define double @stack_fold_roundsd_minsize(double %a0) minsize { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vroundsd $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call double @llvm.floor.f64(double %a0) @@ -2908,8 +2908,8 @@ define float @stack_fold_roundss(float %a0) optsize { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vroundss $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: vxorps %xmm15, %xmm15, %xmm15 +; CHECK-NEXT: vroundss $9, {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call float @llvm.floor.f32(float %a0) @@ -3106,8 +3106,8 @@ define double @stack_fold_sqrtsd(double %a0) optsize { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: 
#NO_APP -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vsqrtsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 8-byte Folded Reload +; CHECK-NEXT: vxorps %xmm15, %xmm15, %xmm15 +; CHECK-NEXT: vsqrtsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 8-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call double @llvm.sqrt.f64(double %a0) @@ -3124,8 +3124,8 @@ define float @stack_fold_sqrtss(float %a0) optsize { ; CHECK-NEXT: #APP ; CHECK-NEXT: nop ; CHECK-NEXT: #NO_APP -; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vsqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 4-byte Folded Reload +; CHECK-NEXT: vxorps %xmm15, %xmm15, %xmm15 +; CHECK-NEXT: vsqrtss {{[-0-9]+}}(%r{{[sb]}}p), %xmm15, %xmm0 # 4-byte Folded Reload ; CHECK-NEXT: retq %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"() %2 = call float @llvm.sqrt.f32(float %a0) diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll index 7c788d291a5c7..cd4ceca6716b1 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-128.ll @@ -227,9 +227,9 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; AVX-64-LABEL: sitofp_v2i64_v2f32: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX-64-NEXT: vmovq %xmm0, %rax -; AVX-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-64-NEXT: retq ; @@ -246,9 +246,9 @@ define <2 x float> @sitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; AVX512DQ-64-LABEL: 
sitofp_v2i64_v2f32: ; AVX512DQ-64: # %bb.0: ; AVX512DQ-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512DQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512DQ-64-NEXT: vmovq %xmm0, %rax -; AVX512DQ-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512DQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512DQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX512DQ-64-NEXT: retq ; @@ -439,9 +439,9 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; AVX1-64-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-64-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-64-NEXT: vmovq %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero ; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX1-64-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -453,18 +453,18 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; AVX512F-64-LABEL: uitofp_v2i64_v2f32: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX512F-64-NEXT: retq ; ; AVX512VL-64-LABEL: uitofp_v2i64_v2f32: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX512VL-64-NEXT: retq ; @@ -481,9 
+481,9 @@ define <2 x float> @uitofp_v2i64_v2f32(<2 x i64> %x) #0 { ; AVX512DQ-64-LABEL: uitofp_v2i64_v2f32: ; AVX512DQ-64: # %bb.0: ; AVX512DQ-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512DQ-64-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512DQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512DQ-64-NEXT: vmovq %xmm0, %rax -; AVX512DQ-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512DQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512DQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX512DQ-64-NEXT: retq ; @@ -1237,9 +1237,9 @@ define <2 x double> @sitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; AVX-64-LABEL: sitofp_v2i64_v2f64: ; AVX-64: # %bb.0: ; AVX-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX-64-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX-64-NEXT: vmovq %xmm0, %rax -; AVX-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX-64-NEXT: retq ; @@ -1439,7 +1439,7 @@ define <2 x double> @uitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; AVX1-64-NEXT: orq %rcx, %rdx ; AVX1-64-NEXT: testq %rax, %rax ; AVX1-64-NEXT: cmovnsq %rax, %rdx -; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1 +; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm15, %xmm1 ; AVX1-64-NEXT: jns .LBB21_2 ; AVX1-64-NEXT: # %bb.1: ; AVX1-64-NEXT: vaddsd %xmm1, %xmm1, %xmm1 @@ -1452,7 +1452,7 @@ define <2 x double> @uitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; AVX1-64-NEXT: orq %rcx, %rdx ; AVX1-64-NEXT: testq %rax, %rax ; AVX1-64-NEXT: cmovnsq %rax, %rdx -; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm0 +; AVX1-64-NEXT: vcvtsi2sd %rdx, %xmm15, %xmm0 ; AVX1-64-NEXT: jns .LBB21_4 ; AVX1-64-NEXT: # %bb.3: ; AVX1-64-NEXT: vaddsd %xmm0, %xmm0, %xmm0 @@ -1463,18 +1463,18 @@ define <2 x double> @uitofp_v2i64_v2f64(<2 x i64> %x) #0 { ; AVX512F-64-LABEL: uitofp_v2i64_v2f64: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512F-64-NEXT: vcvtusi2sd 
%rax, %xmm15, %xmm1 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0 +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-64-NEXT: retq ; ; AVX512VL-64-LABEL: uitofp_v2i64_v2f64: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm1 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0 +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll index a336d0a01fa7b..f790377f3331a 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-256.ll @@ -670,14 +670,14 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX1-64: # %bb.0: ; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-64-NEXT: vmovq %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-64-NEXT: retq @@ -686,14 +686,14 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax -; 
AVX2-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX2-64-NEXT: vmovq %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-64-NEXT: retq @@ -702,14 +702,14 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512F-64-NEXT: vmovq %xmm1, %rax -; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512F-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-64-NEXT: retq @@ -718,14 +718,14 @@ define <4 x double> @sitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512VL-64-NEXT: vmovq %xmm1, %rax -; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512VL-64-NEXT: vcvtsi2sd 
%rax, %xmm15, %xmm1 ; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512VL-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512VL-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-64-NEXT: retq @@ -802,26 +802,26 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX1-64: # %bb.0: ; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-64-NEXT: vpextrd $2, %xmm1, %eax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-64-NEXT: vmovd %xmm1, %eax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-64-NEXT: vextractps $2, %xmm0, %eax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX1-64-NEXT: vmovq %xmm0, %rax ; AVX1-64-NEXT: movl %eax, %eax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm4 ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-64-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-64-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX1-64-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-64-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX1-64-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 +; AVX1-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-64-NEXT: vunpcklpd 
{{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX1-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-64-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -832,28 +832,28 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-64-NEXT: vextractps $3, %xmm1, %eax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX2-64-NEXT: vextractps $1, %xmm1, %eax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX2-64-NEXT: vextractps $3, %xmm0, %eax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX2-64-NEXT: vextractps $1, %xmm0, %eax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm4 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX2-64-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX2-64-NEXT: vbroadcastsd {{.*#+}} ymm3 = [4.294967296E+9,4.294967296E+9,4.294967296E+9,4.294967296E+9] ; AVX2-64-NEXT: vmulpd %ymm3, %ymm2, %ymm2 ; AVX2-64-NEXT: vextractps $2, %xmm1, %eax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX2-64-NEXT: vmovd %xmm1, %eax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX2-64-NEXT: vextractps $2, %xmm0, %eax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX2-64-NEXT: vmovq %xmm0, %rax ; AVX2-64-NEXT: movl %eax, %eax -; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 +; AVX2-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX2-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; AVX2-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-64-NEXT: vaddpd %ymm0, %ymm2, %ymm0 @@ -863,14 +863,14 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x 
i64> %x) #0 { ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; AVX512F-64-NEXT: vmovq %xmm1, %rax -; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm1 ; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512F-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; AVX512F-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512F-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-64-NEXT: retq @@ -879,14 +879,14 @@ define <4 x double> @uitofp_v4i64_v4f64(<4 x i64> %x) #0 { ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; AVX512VL-64-NEXT: vmovq %xmm1, %rax -; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm1 ; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512VL-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; AVX512VL-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512VL-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-64-NEXT: retq @@ -947,16 +947,16 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX1-64-LABEL: sitofp_v4i64_v4f32: ; AVX1-64: # %bb.0: ; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, 
%xmm1, %xmm1 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-64-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-64-NEXT: vmovq %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX1-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-64-NEXT: vzeroupper ; AVX1-64-NEXT: retq @@ -964,16 +964,16 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX2-64-LABEL: sitofp_v4i64_v4f32: ; AVX2-64: # %bb.0: ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-64-NEXT: vmovq %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX2-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-64-NEXT: vzeroupper ; AVX2-64-NEXT: retq @@ -981,16 +981,16 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX512F-64-LABEL: sitofp_v4i64_v4f32: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512F-64-NEXT: vmovq 
%xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512F-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-64-NEXT: vzeroupper ; AVX512F-64-NEXT: retq @@ -998,16 +998,16 @@ define <4 x float> @sitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX512VL-64-LABEL: sitofp_v4i64_v4f32: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512VL-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-64-NEXT: vzeroupper ; AVX512VL-64-NEXT: retq @@ -1092,16 +1092,16 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX1-64-NEXT: vorpd %ymm3, %ymm1, %ymm1 ; AVX1-64-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; 
AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX1-64-NEXT: vmovq %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-64-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-64-NEXT: vmovq %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; AVX1-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; AVX1-64-NEXT: vaddps %xmm1, %xmm1, %xmm3 ; AVX1-64-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 @@ -1117,16 +1117,16 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX2-64-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-64-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX2-64-NEXT: vmovq %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-64-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-64-NEXT: vmovq %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-64-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 +; AVX2-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX2-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; AVX2-64-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX2-64-NEXT: vextracti128 $1, %ymm0, %xmm3 @@ -1138,16 +1138,16 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX512F-64-LABEL: uitofp_v4i64_v4f32: ; AVX512F-64: # %bb.0: ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; 
AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-64-NEXT: vmovq %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512F-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512F-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512F-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-64-NEXT: vzeroupper ; AVX512F-64-NEXT: retq @@ -1155,16 +1155,16 @@ define <4 x float> @uitofp_v4i64_v4f32(<4 x i64> %x) #0 { ; AVX512VL-64-LABEL: uitofp_v4i64_v4f32: ; AVX512VL-64: # %bb.0: ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-64-NEXT: vmovq %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512VL-64-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512VL-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512VL-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-64-NEXT: vzeroupper ; AVX512VL-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll index 
0cf945202a2d4..59294dd17fbca 100644 --- a/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll +++ b/llvm/test/CodeGen/X86/vec-strict-inttofp-512.ll @@ -323,27 +323,27 @@ define <8 x double> @sitofp_v8i64_v8f64(<8 x i64> %x) #0 { ; NODQ-64: # %bb.0: ; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; NODQ-64-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm2 ; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm4, %xmm0 +; NODQ-64-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -452,27 +452,27 @@ define <8 x double> @uitofp_v8i64_v8f64(<8 x i64> %x) #0 { ; NODQ-64: # %bb.0: ; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm1 ; NODQ-64-NEXT: vpextrq $1, %xmm1, 
%rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm1 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm3, %xmm3 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm2 ; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm4, %xmm0 +; NODQ-64-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; NODQ-64-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 ; NODQ-64-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 @@ -548,28 +548,28 @@ define <8 x float> @sitofp_v8i64_v8f32(<8 x i64> %x) #0 { ; NODQ-64: # %bb.0: ; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; NODQ-64-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; NODQ-64-NEXT: 
vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NODQ-64-NEXT: vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm4, %xmm0 +; NODQ-64-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-64-NEXT: retq @@ -675,28 +675,28 @@ define <8 x float> @uitofp_v8i64_v8f32(<8 x i64> %x) #0 { ; NODQ-64: # %bb.0: ; NODQ-64-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; NODQ-64-NEXT: vpextrq $1, %xmm1, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vmovq %xmm1, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm1 +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; NODQ-64-NEXT: vextracti32x4 $3, %zmm0, %xmm2 ; NODQ-64-NEXT: vmovq %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm3, %xmm3 +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] ; NODQ-64-NEXT: 
vpextrq $1, %xmm2, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm2 +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; NODQ-64-NEXT: vextracti128 $1, %ymm0, %xmm0 ; NODQ-64-NEXT: vmovq %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm3 +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm3 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; NODQ-64-NEXT: vpextrq $1, %xmm0, %rax -; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm4, %xmm0 +; NODQ-64-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; NODQ-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; NODQ-64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; NODQ-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vec_int_to_fp.ll b/llvm/test/CodeGen/X86/vec_int_to_fp.ll index af841cf38b24a..62ab5d82bfbb6 100644 --- a/llvm/test/CodeGen/X86/vec_int_to_fp.ll +++ b/llvm/test/CodeGen/X86/vec_int_to_fp.ll @@ -126,27 +126,27 @@ define <2 x double> @sitofp_2i64_to_2f64(<2 x i64> %a) { ; VEX-LABEL: sitofp_2i64_to_2f64: ; VEX: # %bb.0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; VEX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_2i64_to_2f64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtsi2sd %rax, 
%xmm15, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_2i64_to_2f64: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512VL-NEXT: retq ; @@ -352,14 +352,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -368,14 +368,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX2-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX2-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX2-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX2-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX2-NEXT: 
vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-NEXT: retq @@ -384,14 +384,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -400,14 +400,14 @@ define <4 x double> @sitofp_4i64_to_4f64(<4 x i64> %a) { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512VL-NEXT: vmovq %xmm1, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -1247,27 +1247,27 @@ define <4 x float> @sitofp_2i64_to_4f32(<2 x i64> %a) { ; VEX-LABEL: sitofp_2i64_to_4f32: ; VEX: # %bb.0: ; VEX-NEXT: vpextrq $1, %xmm0, 
%rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_2i64_to_4f32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_2i64_to_4f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512VL-NEXT: retq ; @@ -1316,27 +1316,27 @@ define <4 x float> @sitofp_2i64_to_4f32_zero(<2 x i64> %a) { ; VEX-LABEL: sitofp_2i64_to_4f32_zero: ; VEX: # %bb.0: ; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_2i64_to_4f32_zero: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512F-NEXT: retq ; ; 
AVX512VL-LABEL: sitofp_2i64_to_4f32_zero: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512VL-NEXT: retq ; @@ -1383,27 +1383,27 @@ define <4 x float> @sitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; VEX-LABEL: sitofp_4i64_to_4f32_undef: ; VEX: # %bb.0: ; VEX-NEXT: vpextrq $1, %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; VEX-NEXT: vmovq %xmm0, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_4i64_to_4f32_undef: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_4i64_to_4f32_undef: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512VL-NEXT: retq ; @@ -1581,16 +1581,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX1-LABEL: sitofp_4i64_to_4f32: ; AVX1: # %bb.0: ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovq 
%xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -1598,16 +1598,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX2-LABEL: sitofp_4i64_to_4f32: ; AVX2: # %bb.0: ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vmovq %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX2-NEXT: vpextrq $1, %xmm0, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq @@ -1615,16 +1615,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512F-LABEL: sitofp_4i64_to_4f32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 
$1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -1632,16 +1632,16 @@ define <4 x float> @sitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512VL-LABEL: sitofp_4i64_to_4f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -1892,9 +1892,9 @@ define <4 x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; VEX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; VEX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 ; VEX-NEXT: vpextrq $1, %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; VEX-NEXT: vmovq %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero ; VEX-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; VEX-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -1906,18 +1906,18 @@ define <4 
x float> @uitofp_2i64_to_4f32(<2 x i64> %a) { ; AVX512F-LABEL: uitofp_2i64_to_4f32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_2i64_to_4f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512VL-NEXT: retq ; @@ -2007,9 +2007,9 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; VEX-NEXT: vpor %xmm1, %xmm2, %xmm1 ; VEX-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 ; VEX-NEXT: vpextrq $1, %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; VEX-NEXT: vmovq %xmm1, %rax -; VEX-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero ; VEX-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; VEX-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -2022,18 +2022,18 @@ define <4 x float> @uitofp_2i64_to_2f32(<2 x i64> %a) { ; AVX512F-LABEL: uitofp_2i64_to_2f32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_2i64_to_2f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovq 
%xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],zero,zero ; AVX512VL-NEXT: retq ; @@ -2125,9 +2125,9 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; AVX1-NEXT: vmovaps %xmm0, %xmm2 ; AVX1-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] @@ -2148,16 +2148,16 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 @@ -2168,18 +2168,18 @@ define <4 x float> @uitofp_4i64_to_4f32_undef(<2 x i64> %a) { ; AVX512F-LABEL: 
uitofp_4i64_to_4f32_undef: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_4i64_to_4f32_undef: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm0 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],zero,zero ; AVX512VL-NEXT: retq ; @@ -2494,16 +2494,16 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 @@ -2519,16 +2519,16 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpextrq $1, %xmm1, 
%rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 @@ -2540,16 +2540,16 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512F-LABEL: uitofp_4i64_to_4f32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -2557,16 +2557,16 @@ define <4 x float> @uitofp_4i64_to_4f32(<4 x i64> %a) { ; AVX512VL-LABEL: uitofp_4i64_to_4f32: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, 
%xmm1 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512VL-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-NEXT: vmovq %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512VL-NEXT: vpextrq $1, %xmm0, %rax -; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512VL-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -2836,22 +2836,22 @@ define <2 x double> @sitofp_load_2i64_to_2f64(ptr%a) { ; ; VEX-LABEL: sitofp_load_2i64_to_2f64: ; VEX: # %bb.0: -; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm0, %xmm0 -; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm15, %xmm0 +; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm1 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_2i64_to_2f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm15, %xmm0 +; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_2i64_to_2f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm15, %xmm0 +; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; AVX512VL-NEXT: retq ; @@ -3011,33 +3011,33 @@ define <4 x double> @sitofp_load_4i64_to_4f64(ptr%a) { ; ; VEX-LABEL: sitofp_load_4i64_to_4f64: ; VEX: # %bb.0: -; VEX-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0 -; VEX-NEXT: 
vcvtsi2sdq 16(%rdi), %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2sdq 24(%rdi), %xmm15, %xmm0 +; VEX-NEXT: vcvtsi2sdq 16(%rdi), %xmm15, %xmm1 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1 -; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2sdq 8(%rdi), %xmm15, %xmm1 +; VEX-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm2 ; VEX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; VEX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_4i64_to_4f64: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2sdq 24(%rdi), %xmm15, %xmm0 +; AVX512F-NEXT: vcvtsi2sdq 16(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2sdq 8(%rdi), %xmm15, %xmm1 +; AVX512F-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm2 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_4i64_to_4f64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtsi2sdq 24(%rdi), %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtsi2sdq 16(%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2sdq 24(%rdi), %xmm15, %xmm0 +; AVX512VL-NEXT: vcvtsi2sdq 16(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2sdq 8(%rdi), %xmm15, %xmm1 +; AVX512VL-NEXT: vcvtsi2sdq (%rdi), %xmm15, %xmm2 ; AVX512VL-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq @@ -3776,34 +3776,34 @@ define <4 x float> @sitofp_load_4i64_to_4f32(ptr%a) { ; ; VEX-LABEL: sitofp_load_4i64_to_4f32: ; VEX: # %bb.0: -; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, 
%xmm0 -; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm15, %xmm0 +; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 +; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm15, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 +; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm15, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_4i64_to_4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm15, %xmm0 +; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_4i64_to_4f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm15, %xmm0 +; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm2, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm2, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512VL-NEXT: retq ; @@ -3938,57 +3938,57 @@ define <8 x float> 
@sitofp_load_8i64_to_8f32(ptr%a) { ; ; VEX-LABEL: sitofp_load_8i64_to_8f32: ; VEX: # %bb.0: -; VEX-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0 -; VEX-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1 +; VEX-NEXT: vcvtsi2ssq 40(%rdi), %xmm15, %xmm0 +; VEX-NEXT: vcvtsi2ssq 32(%rdi), %xmm15, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; VEX-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1 +; VEX-NEXT: vcvtsi2ssq 48(%rdi), %xmm15, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; VEX-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1 +; VEX-NEXT: vcvtsi2ssq 56(%rdi), %xmm15, %xmm1 ; VEX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1 -; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2 +; VEX-NEXT: vcvtsi2ssq 8(%rdi), %xmm15, %xmm1 +; VEX-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm2 ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2 +; VEX-NEXT: vcvtsi2ssq 16(%rdi), %xmm15, %xmm2 ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2 +; VEX-NEXT: vcvtsi2ssq 24(%rdi), %xmm15, %xmm2 ; VEX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; VEX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: sitofp_load_8i64_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq 40(%rdi), %xmm15, %xmm0 +; AVX512F-NEXT: vcvtsi2ssq 32(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512F-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq 48(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq 56(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), 
%xmm2, %xmm1 -; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq 8(%rdi), %xmm15, %xmm1 +; AVX512F-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq 16(%rdi), %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2 +; AVX512F-NEXT: vcvtsi2ssq 24(%rdi), %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: sitofp_load_8i64_to_8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtsi2ssq 40(%rdi), %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtsi2ssq 32(%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq 40(%rdi), %xmm15, %xmm0 +; AVX512VL-NEXT: vcvtsi2ssq 32(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512VL-NEXT: vcvtsi2ssq 48(%rdi), %xmm2, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq 48(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vcvtsi2ssq 56(%rdi), %xmm2, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq 56(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq 8(%rdi), %xmm15, %xmm1 +; AVX512VL-NEXT: vcvtsi2ssq (%rdi), %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq 16(%rdi), %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtsi2ssq 24(%rdi), %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; 
AVX512VL-NEXT: retq @@ -4235,16 +4235,16 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 @@ -4261,16 +4261,16 @@ define <4 x float> @uitofp_load_4i64_to_4f32(ptr%a) { ; AVX2-NEXT: vpor %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vmovq %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX2-NEXT: vpextrq $1, %xmm1, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm4, %xmm1 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; AVX2-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX2-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 @@ -4280,23 +4280,23 @@ define <4 x float> 
@uitofp_load_4i64_to_4f32(ptr%a) { ; ; AVX512F-LABEL: uitofp_load_4i64_to_4f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm15, %xmm0 +; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm2, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_4i64_to_4f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm15, %xmm0 +; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm2, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm2, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] ; AVX512VL-NEXT: retq ; @@ -4664,16 +4664,16 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1-NEXT: vorps %ymm3, %ymm4, %ymm3 ; AVX1-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm3 ; AVX1-NEXT: vpextrq $1, %xmm3, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vmovq %xmm3, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm6 ; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm3 ; AVX1-NEXT: vmovq %xmm3, %rax -; 
AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm6 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm6 ; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] ; AVX1-NEXT: vpextrq $1, %xmm3, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[0] ; AVX1-NEXT: vaddps %xmm3, %xmm3, %xmm4 ; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1 @@ -4686,16 +4686,16 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX1-NEXT: vorps %ymm2, %ymm3, %ymm2 ; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm2 ; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm5 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm5 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm5 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm5 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm5[0],xmm3[3] ; AVX1-NEXT: vpextrq $1, %xmm2, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm7, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] ; AVX1-NEXT: vaddps %xmm2, %xmm2, %xmm3 ; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0 @@ -4713,16 +4713,16 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm3 ; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm5, %xmm5 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm5 ; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm3 ; AVX2-NEXT: vmovq %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm5 +; AVX2-NEXT: vcvtsi2ss 
%rax, %xmm15, %xmm5 ; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm5[0],xmm4[3] ; AVX2-NEXT: vpextrq $1, %xmm3, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[0] ; AVX2-NEXT: vaddps %xmm3, %xmm3, %xmm4 ; AVX2-NEXT: vpackssdw 48(%rdi), %xmm1, %xmm1 @@ -4732,16 +4732,16 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm2 ; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm3 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vmovq %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm4 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; AVX2-NEXT: vpextrq $1, %xmm2, %rax -; AVX2-NEXT: vcvtsi2ss %rax, %xmm6, %xmm2 +; AVX2-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX2-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[0] ; AVX2-NEXT: vaddps %xmm2, %xmm2, %xmm3 ; AVX2-NEXT: vpackssdw 16(%rdi), %xmm0, %xmm0 @@ -4751,38 +4751,38 @@ define <8 x float> @uitofp_load_8i64_to_8f32(ptr%a) { ; ; AVX512F-LABEL: uitofp_load_8i64_to_8f32: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vcvtusi2ssq 40(%rdi), %xmm0, %xmm0 -; AVX512F-NEXT: vcvtusi2ssq 32(%rdi), %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq 40(%rdi), %xmm15, %xmm0 +; AVX512F-NEXT: vcvtusi2ssq 32(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512F-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq 48(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512F-NEXT: vcvtusi2ssq 56(%rdi), %xmm2, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq 
56(%rdi), %xmm15, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm2, %xmm1 -; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq 8(%rdi), %xmm15, %xmm1 +; AVX512F-NEXT: vcvtusi2ssq (%rdi), %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm3, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq 16(%rdi), %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm2 +; AVX512F-NEXT: vcvtusi2ssq 24(%rdi), %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512F-NEXT: retq ; ; AVX512VL-LABEL: uitofp_load_8i64_to_8f32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vcvtusi2ssq 40(%rdi), %xmm0, %xmm0 -; AVX512VL-NEXT: vcvtusi2ssq 32(%rdi), %xmm1, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq 40(%rdi), %xmm15, %xmm0 +; AVX512VL-NEXT: vcvtusi2ssq 32(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX512VL-NEXT: vcvtusi2ssq 48(%rdi), %xmm2, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq 48(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX512VL-NEXT: vcvtusi2ssq 56(%rdi), %xmm2, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq 56(%rdi), %xmm15, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] -; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm2, %xmm1 -; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm2, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq 8(%rdi), %xmm15, %xmm1 +; AVX512VL-NEXT: vcvtusi2ssq (%rdi), %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] -; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm3, %xmm2 +; AVX512VL-NEXT: vcvtusi2ssq 16(%rdi), %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] -; AVX512VL-NEXT: vcvtusi2ssq 24(%rdi), %xmm3, %xmm2 +; AVX512VL-NEXT: 
vcvtusi2ssq 24(%rdi), %xmm15, %xmm2 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] ; AVX512VL-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq @@ -5148,7 +5148,7 @@ define float @extract0_sitofp_v4i32_f32i_multiuse1(<4 x i32> %x) nounwind { ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: vcvtdq2ps %xmm0, %xmm0 ; AVX-NEXT: incl %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm1 ; AVX-NEXT: vdivss %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq %e = extractelement <4 x i32> %x, i32 0 @@ -5207,7 +5207,7 @@ define float @extract0_uitofp_v4i32_f32(<4 x i32> %x) nounwind { ; VEX-LABEL: extract0_uitofp_v4i32_f32: ; VEX: # %bb.0: ; VEX-NEXT: vmovd %xmm0, %eax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: extract0_uitofp_v4i32_f32: @@ -5251,7 +5251,7 @@ define double @extract0_uitofp_v4i32_f64(<4 x i32> %x) nounwind { ; VEX-LABEL: extract0_uitofp_v4i32_f64: ; VEX: # %bb.0: ; VEX-NEXT: vmovd %xmm0, %eax -; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 +; VEX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: extract0_uitofp_v4i32_f64: @@ -5348,7 +5348,7 @@ define float @extract3_uitofp_v4i32_f32(<4 x i32> %x) nounwind { ; VEX-LABEL: extract3_uitofp_v4i32_f32: ; VEX: # %bb.0: ; VEX-NEXT: vextractps $3, %xmm0, %eax -; VEX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm0 +; VEX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: extract3_uitofp_v4i32_f32: @@ -5402,7 +5402,7 @@ define double @extract3_uitofp_v4i32_f64(<4 x i32> %x) nounwind { ; VEX-LABEL: extract3_uitofp_v4i32_f64: ; VEX: # %bb.0: ; VEX-NEXT: vextractps $3, %xmm0, %eax -; VEX-NEXT: vcvtsi2sd %rax, %xmm1, %xmm0 +; VEX-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; VEX-NEXT: retq ; ; AVX512F-LABEL: extract3_uitofp_v4i32_f64: diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index 
49062eaef3188..4a5b4277c3cca 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -1,4 +1,4 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -O3 -mtriple=x86_64-pc-linux < %s | FileCheck %s ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX1 ; RUN: llc -O3 -mtriple=x86_64-pc-linux -mattr=+avx512f < %s | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F @@ -6504,7 +6504,7 @@ define <1 x double> @constrained_vector_sitofp_v1f64_v1i32(<1 x i32> %x) #0 { ; ; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2sd %edi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %edi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call <1 x double> @@ -6522,7 +6522,7 @@ define <1 x float> @constrained_vector_sitofp_v1f32_v1i32(<1 x i32> %x) #0 { ; ; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ss %edi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %edi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call <1 x float> @@ -6540,7 +6540,7 @@ define <1 x double> @constrained_vector_sitofp_v1f64_v1i64(<1 x i64> %x) #0 { ; ; AVX-LABEL: constrained_vector_sitofp_v1f64_v1i64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2sd %rdi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2sd %rdi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call <1 x double> @@ -6558,7 +6558,7 @@ define <1 x float> @constrained_vector_sitofp_v1f32_v1i64(<1 x i64> %x) #0 { ; ; AVX-LABEL: constrained_vector_sitofp_v1f32_v1i64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vcvtsi2ss %rdi, %xmm0, %xmm0 +; AVX-NEXT: vcvtsi2ss %rdi, %xmm15, %xmm0 ; AVX-NEXT: retq entry: %result = call <1 x float> @@ -6622,18 +6622,18 @@ define <2 x double> 
@constrained_vector_sitofp_v2f64_v2i64(<2 x i64> %x) #0 { ; AVX1-LABEL: constrained_vector_sitofp_v2f64_v2i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX1-NEXT: retq ; ; AVX512F-LABEL: constrained_vector_sitofp_v2f64_v2i64: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; @@ -6668,9 +6668,9 @@ define <2 x float> @constrained_vector_sitofp_v2f32_v2i64(<2 x i64> %x) #0 { ; AVX-LABEL: constrained_vector_sitofp_v2f32_v2i64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vpextrq $1, %xmm0, %rax -; AVX-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX-NEXT: vmovq %xmm0, %rax -; AVX-NEXT: vcvtsi2ss %rax, %xmm2, %xmm0 +; AVX-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX-NEXT: retq entry: @@ -6703,12 +6703,12 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i32(<3 x i32> %x) #0 { ; AVX-LABEL: constrained_vector_sitofp_v3f64_v3i32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vextractps $1, %xmm0, %eax -; AVX-NEXT: vcvtsi2sd %eax, %xmm1, %xmm1 +; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm1 ; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vcvtsi2sd %eax, %xmm2, %xmm2 +; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm2 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: vcvtsi2sd %eax, %xmm3, %xmm0 +; AVX-NEXT: vcvtsi2sd %eax, %xmm15, %xmm0 ; AVX-NEXT: vinsertf128 
$1, %xmm0, %ymm1, %ymm0 ; AVX-NEXT: retq entry: @@ -6740,12 +6740,12 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; AVX-LABEL: constrained_vector_sitofp_v3f32_v3i32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vextractps $1, %xmm0, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm1, %xmm1 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm1 ; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm2, %xmm2 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm2 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX-NEXT: vpextrd $2, %xmm0, %eax -; AVX-NEXT: vcvtsi2ss %eax, %xmm3, %xmm0 +; AVX-NEXT: vcvtsi2ss %eax, %xmm15, %xmm0 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX-NEXT: retq entry: @@ -6770,26 +6770,26 @@ define <3 x double> @constrained_vector_sitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; AVX1-LABEL: constrained_vector_sitofp_v3f64_v3i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_sitofp_v3f64_v3i64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vcvtsi2sd %rax, 
%xmm15, %xmm0 ; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: retq entry: @@ -6814,13 +6814,13 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX1-LABEL: constrained_vector_sitofp_v3f32_v3i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -6828,13 +6828,13 @@ define <3 x float> @constrained_vector_sitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX512-LABEL: constrained_vector_sitofp_v3f32_v3i64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -6910,14 +6910,14 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, 
%xmm1 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq @@ -6926,14 +6926,14 @@ define <4 x double> @constrained_vector_sitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -6977,16 +6977,16 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; AVX1-LABEL: constrained_vector_sitofp_v4f32_v4i64: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vmovq %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, 
%xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX1-NEXT: vpextrq $1, %xmm0, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq @@ -6994,16 +6994,16 @@ define <4 x float> @constrained_vector_sitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; AVX512F-LABEL: constrained_vector_sitofp_v4f32_v4i64: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtsi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -7033,12 +7033,12 @@ define <1 x double> @constrained_vector_uitofp_v1f64_v1i32(<1 x i32> %x) #0 { ; AVX1-LABEL: constrained_vector_uitofp_v1f64_v1i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2sd %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2sd %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %result = call <1 x double> @@ -7058,12 +7058,12 @@ define <1 x float> @constrained_vector_uitofp_v1f32_v1i32(<1 x i32> %x) #0 { ; AVX1-LABEL: constrained_vector_uitofp_v1f32_v1i32: ; 
AVX1: # %bb.0: # %entry ; AVX1-NEXT: movl %edi, %eax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2ss %edi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2ss %edi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %result = call <1 x float> @@ -7099,7 +7099,7 @@ define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 { ; AVX1-NEXT: orq %rax, %rcx ; AVX1-NEXT: testq %rdi, %rdi ; AVX1-NEXT: cmovnsq %rdi, %rcx -; AVX1-NEXT: vcvtsi2sd %rcx, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rcx, %xmm15, %xmm0 ; AVX1-NEXT: jns .LBB175_2 ; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 @@ -7108,7 +7108,7 @@ define <1 x double> @constrained_vector_uitofp_v1f64_v1i64(<1 x i64> %x) #0 { ; ; AVX512-LABEL: constrained_vector_uitofp_v1f64_v1i64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2sd %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2sd %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %result = call <1 x double> @@ -7144,7 +7144,7 @@ define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 { ; AVX1-NEXT: orq %rax, %rcx ; AVX1-NEXT: testq %rdi, %rdi ; AVX1-NEXT: cmovnsq %rdi, %rcx -; AVX1-NEXT: vcvtsi2ss %rcx, %xmm0, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rcx, %xmm15, %xmm0 ; AVX1-NEXT: jns .LBB176_2 ; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 @@ -7153,7 +7153,7 @@ define <1 x float> @constrained_vector_uitofp_v1f32_v1i64(<1 x i64> %x) #0 { ; ; AVX512-LABEL: constrained_vector_uitofp_v1f32_v1i64: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vcvtusi2ss %rdi, %xmm0, %xmm0 +; AVX512-NEXT: vcvtusi2ss %rdi, %xmm15, %xmm0 ; AVX512-NEXT: retq entry: %result = call <1 x float> @@ -7279,7 +7279,7 @@ define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2sd 
%rdx, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm15, %xmm1 ; AVX1-NEXT: jns .LBB179_2 ; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddsd %xmm1, %xmm1, %xmm1 @@ -7292,7 +7292,7 @@ define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm15, %xmm0 ; AVX1-NEXT: jns .LBB179_4 ; AVX1-NEXT: # %bb.3: ; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 @@ -7303,9 +7303,9 @@ define <2 x double> @constrained_vector_uitofp_v2f64_v2i64(<2 x i64> %x) #0 { ; AVX512F-LABEL: constrained_vector_uitofp_v2f64_v2i64: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sd %rax, %xmm2, %xmm0 +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512F-NEXT: retq ; @@ -7367,9 +7367,9 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 { ; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm3, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],zero,zero ; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm2 ; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3 @@ -7381,9 +7381,9 @@ define <2 x float> @constrained_vector_uitofp_v2f32_v2i64(<2 x i64> %x) #0 { ; AVX512-LABEL: constrained_vector_uitofp_v2f32_v2i64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtusi2ss 
%rax, %xmm2, %xmm0 +; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] ; AVX512-NEXT: retq entry: @@ -7416,24 +7416,24 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i32(<3 x i32> %x) #0 { ; AVX1-LABEL: constrained_vector_uitofp_v3f64_v3i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractps $1, %xmm0, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vextractps $1, %xmm0, %eax -; AVX512-NEXT: vcvtusi2sd %eax, %xmm1, %xmm1 +; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm1 ; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vcvtusi2sd %eax, %xmm2, %xmm2 +; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm2 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vpextrd $2, %xmm0, %eax -; AVX512-NEXT: vcvtusi2sd %eax, %xmm3, %xmm0 +; AVX512-NEXT: vcvtusi2sd %eax, %xmm15, %xmm0 ; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: retq entry: @@ -7465,24 +7465,24 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i32(<3 x i32> %x) #0 { ; AVX1-LABEL: constrained_vector_uitofp_v3f32_v3i32: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractps $1, %xmm0, %eax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vmovd %xmm0, %eax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm2 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX1-NEXT: vpextrd $2, %xmm0, %eax -; AVX1-NEXT: vcvtsi2ss 
%rax, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm0 ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX1-NEXT: retq ; ; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i32: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vextractps $1, %xmm0, %eax -; AVX512-NEXT: vcvtusi2ss %eax, %xmm1, %xmm1 +; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm1 ; AVX512-NEXT: vmovd %xmm0, %eax -; AVX512-NEXT: vcvtusi2ss %eax, %xmm2, %xmm2 +; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm2 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vpextrd $2, %xmm0, %eax -; AVX512-NEXT: vcvtusi2ss %eax, %xmm3, %xmm0 +; AVX512-NEXT: vcvtusi2ss %eax, %xmm15, %xmm0 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX512-NEXT: retq entry: @@ -7547,7 +7547,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2sd %rdx, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm15, %xmm1 ; AVX1-NEXT: jns .LBB183_2 ; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddsd %xmm1, %xmm1, %xmm1 @@ -7560,7 +7560,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2sd %rdx, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm15, %xmm2 ; AVX1-NEXT: jns .LBB183_4 ; AVX1-NEXT: # %bb.3: ; AVX1-NEXT: vaddsd %xmm2, %xmm2, %xmm2 @@ -7575,7 +7575,7 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2sd %rdx, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rdx, %xmm15, %xmm0 ; AVX1-NEXT: jns .LBB183_6 ; AVX1-NEXT: # %bb.5: ; AVX1-NEXT: vaddsd %xmm0, %xmm0, %xmm0 @@ -7586,13 +7586,13 @@ define <3 x double> @constrained_vector_uitofp_v3f64_v3i64(<3 x i64> %x) #0 { ; 
AVX512-LABEL: constrained_vector_uitofp_v3f64_v3i64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtusi2sd %rax, %xmm1, %xmm1 +; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; AVX512-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; AVX512-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX512-NEXT: retq entry: @@ -7657,7 +7657,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2ss %rdx, %xmm1, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm15, %xmm1 ; AVX1-NEXT: jns .LBB184_2 ; AVX1-NEXT: # %bb.1: ; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 @@ -7670,7 +7670,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2ss %rdx, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm15, %xmm2 ; AVX1-NEXT: jns .LBB184_4 ; AVX1-NEXT: # %bb.3: ; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 @@ -7685,7 +7685,7 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX1-NEXT: orq %rcx, %rdx ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: cmovnsq %rax, %rdx -; AVX1-NEXT: vcvtsi2ss %rdx, %xmm3, %xmm0 +; AVX1-NEXT: vcvtsi2ss %rdx, %xmm15, %xmm0 ; AVX1-NEXT: jns .LBB184_6 ; AVX1-NEXT: # %bb.5: ; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 @@ -7697,13 +7697,13 @@ define <3 x float> @constrained_vector_uitofp_v3f32_v3i64(<3 x i64> %x) #0 { ; AVX512-LABEL: constrained_vector_uitofp_v3f32_v3i64: ; AVX512: # %bb.0: # %entry ; AVX512-NEXT: vpextrq $1, %xmm0, %rax -; AVX512-NEXT: vcvtusi2ss %rax, 
%xmm1, %xmm1 +; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512-NEXT: vmovq %xmm0, %rax -; AVX512-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -7860,26 +7860,26 @@ define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vpextrd $2, %xmm1, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm2 ; AVX1-NEXT: vmovd %xmm1, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX1-NEXT: vextractps $2, %xmm0, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm3 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: movl %eax, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm4 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm3 = xmm4[0],xmm3[0] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2 ; AVX1-NEXT: vpextrd $3, %xmm1, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX1-NEXT: vpextrd $1, %xmm1, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm1 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm1 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm3[0] ; AVX1-NEXT: vpextrd $3, %xmm0, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm3 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm3 ; AVX1-NEXT: vpextrd $1, %xmm0, %eax -; AVX1-NEXT: vcvtsi2sd %rax, %xmm5, %xmm0 +; AVX1-NEXT: vcvtsi2sd %rax, %xmm15, %xmm0 ; AVX1-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm3[0] ; 
AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: vmulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -7890,14 +7890,14 @@ define <4 x double> @constrained_vector_uitofp_v4f64_v4i64(<4 x i64> %x) #0 { ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-NEXT: vpextrq $1, %xmm1, %rax -; AVX512F-NEXT: vcvtusi2sd %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vmovq %xmm1, %rax -; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm1 +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2sd %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtusi2sd %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; AVX512F-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512F-NEXT: retq @@ -7991,16 +7991,16 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; AVX1-NEXT: vorpd %ymm3, %ymm1, %ymm1 ; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm1 ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm3 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm3 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1 ; AVX1-NEXT: vmovq %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm4 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm4 ; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1],xmm4[0],xmm3[3] ; AVX1-NEXT: vpextrq $1, %xmm1, %rax -; AVX1-NEXT: vcvtsi2ss %rax, %xmm5, %xmm1 +; AVX1-NEXT: vcvtsi2ss %rax, %xmm15, %xmm1 ; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[0] ; AVX1-NEXT: vaddps %xmm1, %xmm1, %xmm3 ; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0 @@ 
-8011,16 +8011,16 @@ define <4 x float> @constrained_vector_uitofp_v4f32_v4i64(<4 x i64> %x) #0 { ; AVX512F-LABEL: constrained_vector_uitofp_v4f32_v4i64: ; AVX512F: # %bb.0: # %entry ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm1, %xmm1 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm1 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm2, %xmm2 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[2,3] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512F-NEXT: vmovq %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm2 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm2 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] ; AVX512F-NEXT: vpextrq $1, %xmm0, %rax -; AVX512F-NEXT: vcvtusi2ss %rax, %xmm3, %xmm0 +; AVX512F-NEXT: vcvtusi2ss %rax, %xmm15, %xmm0 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 6392514bf4157..bc1650a4acf0b 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -1403,10 +1403,10 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream &OS) { for (const auto &RC : RegisterClasses) { if (!RC.AltOrderSelect.empty()) { OS << "\nstatic inline unsigned " << RC.getName() - << "AltOrderSelect(const MachineFunction &MF) {" << RC.AltOrderSelect - << "}\n\n" + << "AltOrderSelect(const MachineFunction &MF, bool Rev) {" + << RC.AltOrderSelect << "}\n\n" << "static ArrayRef " << RC.getName() - << "GetRawAllocationOrder(const MachineFunction &MF) {\n"; + << "GetRawAllocationOrder(const MachineFunction &MF, bool Rev) {\n"; for (unsigned oi = 1, oe = RC.getNumOrders(); oi != oe; ++oi) { ArrayRef Elems = RC.getOrder(oi); if (!Elems.empty()) { @@ -1426,8 +1426,8 @@ void RegisterInfoEmitter::runTargetDesc(raw_ostream 
&OS) { else OS << "),\n ArrayRef(AltOrder" << oi; OS << ")\n };\n const unsigned Select = " << RC.getName() - << "AltOrderSelect(MF);\n assert(Select < " << RC.getNumOrders() - << ");\n return Order[Select];\n}\n"; + << "AltOrderSelect(MF, Rev);\n assert(Select < " + << RC.getNumOrders() << ");\n return Order[Select];\n}\n"; } } From fa9e1a1515549124dd76ddc55a8a532795d51fae Mon Sep 17 00:00:00 2001 From: RonDahan101 <166982786+RonDahan101@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:15:24 +0300 Subject: [PATCH 084/851] [AArch64] Expand llvm.histogram intrinsic to support umax, umin, and uadd.sat operations (#138447) This patch extends the llvm.histogram intrinsic to support additional update operations beyond the existing add. Specifically, the new supported operations are: * umax: unsigned maximum * umin: unsigned minimum * uadd.sat: unsigned saturated addition Based on the discussion from: https://discourse.llvm.org/t/rfc-expanding-the-experimental-histogram-intrinsic/84673 --- llvm/docs/LangRef.rst | 3 + llvm/include/llvm/IR/Intrinsics.td | 18 + .../Scalar/ScalarizeMaskedMemIntrin.cpp | 36 +- .../AArch64/neon-scalarize-histogram.ll | 354 ++++++++++++++++++ 4 files changed, 407 insertions(+), 4 deletions(-) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 78604d0df6bc6..cc72a37f68599 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -20545,6 +20545,9 @@ More update operation types may be added in the future. 
declare void @llvm.experimental.vector.histogram.add.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask) declare void @llvm.experimental.vector.histogram.add.nxv2p0.i64( %ptrs, i64 %inc, %mask) + declare void @llvm.experimental.vector.histogram.uadd.sat.v8p0.i32(<8 x ptr> %ptrs, i32 %inc, <8 x i1> %mask) + declare void @llvm.experimental.vector.histogram.umax.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask) + declare void @llvm.experimental.vector.histogram.umin.v8p0.i32(<8 x ptr> %ptrs, i32 %val, <8 x i1> %mask) Arguments: """""""""" diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index e68243c2e406b..7add4a27ce9e9 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1968,6 +1968,24 @@ def int_experimental_vector_histogram_add : DefaultAttrsIntrinsic<[], LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask [ IntrArgMemOnly ]>; +def int_experimental_vector_histogram_uadd_sat : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, // Vector of pointers + llvm_anyint_ty, // Increment + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask + [ IntrArgMemOnly ]>; + +def int_experimental_vector_histogram_umin : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, // Vector of pointers + llvm_anyint_ty, // Update value + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask + [ IntrArgMemOnly ]>; + +def int_experimental_vector_histogram_umax : DefaultAttrsIntrinsic<[], + [ llvm_anyvector_ty, // Vector of pointers + llvm_anyint_ty, // Update value + LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>], // Mask + [ IntrArgMemOnly ]>; + // Experimental match def int_experimental_vector_match : DefaultAttrsIntrinsic< [ LLVMScalarOrSameVectorWidth<0, llvm_i1_ty> ], diff --git a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp index e24088c294987..42d6680c3cb7d 100644 --- a/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp +++ 
b/llvm/lib/Transforms/Scalar/ScalarizeMaskedMemIntrin.cpp @@ -968,6 +968,29 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI, // FIXME: Do we need to add an alignment parameter to the intrinsic? unsigned VectorWidth = AddrType->getNumElements(); + auto CreateHistogramUpdateValue = [&](IntrinsicInst *CI, Value *Load, + Value *Inc) -> Value * { + Value *UpdateOp; + switch (CI->getIntrinsicID()) { + case Intrinsic::experimental_vector_histogram_add: + UpdateOp = Builder.CreateAdd(Load, Inc); + break; + case Intrinsic::experimental_vector_histogram_uadd_sat: + UpdateOp = + Builder.CreateIntrinsic(Intrinsic::uadd_sat, {EltTy}, {Load, Inc}); + break; + case Intrinsic::experimental_vector_histogram_umin: + UpdateOp = Builder.CreateIntrinsic(Intrinsic::umin, {EltTy}, {Load, Inc}); + break; + case Intrinsic::experimental_vector_histogram_umax: + UpdateOp = Builder.CreateIntrinsic(Intrinsic::umax, {EltTy}, {Load, Inc}); + break; + + default: + llvm_unreachable("Unexpected histogram intrinsic"); + } + return UpdateOp; + }; // Shorten the way if the mask is a vector of constants. 
if (isConstantIntVector(Mask)) { @@ -976,8 +999,9 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI, continue; Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx)); LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx)); - Value *Add = Builder.CreateAdd(Load, Inc); - Builder.CreateStore(Add, Ptr); + Value *Update = + CreateHistogramUpdateValue(cast(CI), Load, Inc); + Builder.CreateStore(Update, Ptr); } CI->eraseFromParent(); return; @@ -997,8 +1021,9 @@ static void scalarizeMaskedVectorHistogram(const DataLayout &DL, CallInst *CI, Builder.SetInsertPoint(CondBlock->getTerminator()); Value *Ptr = Builder.CreateExtractElement(Ptrs, Idx, "Ptr" + Twine(Idx)); LoadInst *Load = Builder.CreateLoad(EltTy, Ptr, "Load" + Twine(Idx)); - Value *Add = Builder.CreateAdd(Load, Inc); - Builder.CreateStore(Add, Ptr); + Value *UpdateOp = + CreateHistogramUpdateValue(cast(CI), Load, Inc); + Builder.CreateStore(UpdateOp, Ptr); // Create "else" block, fill it in the next iteration BasicBlock *NewIfBlock = ThenTerm->getSuccessor(0); @@ -1089,6 +1114,9 @@ static bool optimizeCallInst(CallInst *CI, bool &ModifiedDT, default: break; case Intrinsic::experimental_vector_histogram_add: + case Intrinsic::experimental_vector_histogram_uadd_sat: + case Intrinsic::experimental_vector_histogram_umin: + case Intrinsic::experimental_vector_histogram_umax: if (TTI.isLegalMaskedVectorHistogram(CI->getArgOperand(0)->getType(), CI->getArgOperand(1)->getType())) return false; diff --git a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll index e59d9098a30d6..ca74b4e95b0ae 100644 --- a/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll +++ b/llvm/test/CodeGen/AArch64/neon-scalarize-histogram.ll @@ -112,3 +112,357 @@ define void @histogram_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) { call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<4 x ptr> %buckets, i32 1, 
<4 x i1> ) ret void } + +define void @histogram_uadd_sat_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) { +; CHECK-LABEL: histogram_uadd_sat_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: tbnz w8, #0, .LBB3_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbnz w8, #0, .LBB3_4 +; CHECK-NEXT: .LBB3_2: // %else2 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB3_3: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: adds x9, x9, x0 +; CHECK-NEXT: csinv x9, x9, xzr, lo +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbz w8, #0, .LBB3_2 +; CHECK-NEXT: .LBB3_4: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: adds x9, x9, x0 +; CHECK-NEXT: csinv x9, x9, xzr, lo +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: ret + call void @llvm.experimental.vector.histogram.uadd.sat.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) + ret void +} + +define void @histogram_uadd_sat_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) { +; CHECK-LABEL: histogram_uadd_sat_i32_literal: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v2.2d, x0 +; CHECK-NEXT: sshll v3.2d, v0.2s, #2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: add v3.2d, v2.2d, v3.2d +; CHECK-NEXT: tbz w8, #0, .LBB4_2 +; CHECK-NEXT: // %bb.1: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB4_2: // %else +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: tbz w8, #0, .LBB4_4 +; CHECK-NEXT: // %bb.3: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; 
CHECK-NEXT: .LBB4_4: // %else2 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: tbnz w8, #0, .LBB4_7 +; CHECK-NEXT: // %bb.5: // %else4 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbnz w8, #0, .LBB4_8 +; CHECK-NEXT: .LBB4_6: // %else6 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB4_7: // %cond.histogram.update3 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbz w8, #0, .LBB4_6 +; CHECK-NEXT: .LBB4_8: // %cond.histogram.update5 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask) + ret void +} + +define void @histogram_uadd_sat_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) { +; CHECK-LABEL: histogram_uadd_sat_i32_literal_alltruemask: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2d, x0 +; CHECK-NEXT: sshll v2.2d, v0.2s, #2 +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: add v2.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov x10, v2.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: adds w8, w8, #1 +; CHECK-NEXT: csinv w8, w8, wzr, lo +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: adds w9, w9, #1 +; CHECK-NEXT: csinv w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: adds w8, w8, #1 +; CHECK-NEXT: csinv w8, w8, wzr, lo +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: ret + %buckets = getelementptr i32, 
ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.uadd.sat.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> ) + ret void +} + +define void @histogram_umax_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) { +; CHECK-LABEL: histogram_umax_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: tbnz w8, #0, .LBB6_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbnz w8, #0, .LBB6_4 +; CHECK-NEXT: .LBB6_2: // %else2 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB6_3: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: cmp x9, x0 +; CHECK-NEXT: csel x9, x9, x0, hi +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbz w8, #0, .LBB6_2 +; CHECK-NEXT: .LBB6_4: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: cmp x9, x0 +; CHECK-NEXT: csel x9, x9, x0, hi +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: ret + call void @llvm.experimental.vector.histogram.umax.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) + ret void +} + +define void @histogram_umax_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) { +; CHECK-LABEL: histogram_umax_i32_literal: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v2.2d, x0 +; CHECK-NEXT: sshll v3.2d, v0.2s, #2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: add v3.2d, v2.2d, v3.2d +; CHECK-NEXT: tbz w8, #0, .LBB7_2 +; CHECK-NEXT: // %bb.1: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB7_2: // %else +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: tbz w8, #0, .LBB7_4 +; CHECK-NEXT: // %bb.3: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v3.d[1] +; CHECK-NEXT: ldr w9, [x8] +; 
CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB7_4: // %else2 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: tbnz w8, #0, .LBB7_7 +; CHECK-NEXT: // %bb.5: // %else4 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbnz w8, #0, .LBB7_8 +; CHECK-NEXT: .LBB7_6: // %else6 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB7_7: // %cond.histogram.update3 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbz w8, #0, .LBB7_6 +; CHECK-NEXT: .LBB7_8: // %cond.histogram.update5 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.umax.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask) + ret void +} + +define void @histogram_umax_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) { +; CHECK-LABEL: histogram_umax_i32_literal_alltruemask: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2d, x0 +; CHECK-NEXT: sshll v2.2d, v0.2s, #2 +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: add v2.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov x10, v2.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: csinc w8, w8, wzr, hi +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, hi +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: csinc w8, w8, wzr, hi +; CHECK-NEXT: str w8, [x10] 
+; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.umax.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> ) + ret void +} + +define void @histogram_umin_i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) { +; CHECK-LABEL: histogram_umin_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: fmov w8, s1 +; CHECK-NEXT: tbnz w8, #0, .LBB9_3 +; CHECK-NEXT: // %bb.1: // %else +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbnz w8, #0, .LBB9_4 +; CHECK-NEXT: .LBB9_2: // %else2 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB9_3: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: cmp x9, x0 +; CHECK-NEXT: csel x9, x9, x0, lo +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: mov w8, v1.s[1] +; CHECK-NEXT: tbz w8, #0, .LBB9_2 +; CHECK-NEXT: .LBB9_4: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr x9, [x8] +; CHECK-NEXT: cmp x9, x0 +; CHECK-NEXT: csel x9, x9, x0, lo +; CHECK-NEXT: str x9, [x8] +; CHECK-NEXT: ret + call void @llvm.experimental.vector.histogram.umin.nxv2p0.i64(<2 x ptr> %buckets, i64 %inc, <2 x i1> %mask) + ret void +} + +define void @histogram_umin_i32_literal(ptr %base, <4 x i32> %indices, <4 x i1> %mask) { +; CHECK-LABEL: histogram_umin_i32_literal: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v2.2d, x0 +; CHECK-NEXT: sshll v3.2d, v0.2s, #2 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: umov w8, v1.h[0] +; CHECK-NEXT: add v3.2d, v2.2d, v3.2d +; CHECK-NEXT: tbz w8, #0, .LBB10_2 +; CHECK-NEXT: // %bb.1: // %cond.histogram.update +; CHECK-NEXT: fmov x8, d3 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB10_2: // %else +; CHECK-NEXT: umov w8, v1.h[1] +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: tbz w8, #0, .LBB10_4 +; CHECK-NEXT: // %bb.3: // %cond.histogram.update1 +; CHECK-NEXT: mov x8, 
v3.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: .LBB10_4: // %else2 +; CHECK-NEXT: umov w8, v1.h[2] +; CHECK-NEXT: add v0.2d, v2.2d, v0.2d +; CHECK-NEXT: tbnz w8, #0, .LBB10_7 +; CHECK-NEXT: // %bb.5: // %else4 +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbnz w8, #0, .LBB10_8 +; CHECK-NEXT: .LBB10_6: // %else6 +; CHECK-NEXT: ret +; CHECK-NEXT: .LBB10_7: // %cond.histogram.update3 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: umov w8, v1.h[3] +; CHECK-NEXT: tbz w8, #0, .LBB10_6 +; CHECK-NEXT: .LBB10_8: // %cond.histogram.update5 +; CHECK-NEXT: mov x8, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.umin.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> %mask) + ret void +} + +define void @histogram_umin_i32_literal_alltruemask(ptr %base, <4 x i32> %indices) { +; CHECK-LABEL: histogram_umin_i32_literal_alltruemask: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2d, x0 +; CHECK-NEXT: sshll v2.2d, v0.2s, #2 +; CHECK-NEXT: sshll2 v0.2d, v0.4s, #2 +; CHECK-NEXT: add v2.2d, v1.2d, v2.2d +; CHECK-NEXT: add v0.2d, v1.2d, v0.2d +; CHECK-NEXT: fmov x8, d2 +; CHECK-NEXT: mov x10, v2.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: csinc w8, w8, wzr, lo +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: mov x10, v0.d[1] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: cmp w9, #1 +; CHECK-NEXT: csinc w9, w9, wzr, lo +; CHECK-NEXT: str w9, [x8] +; CHECK-NEXT: ldr w8, [x10] +; CHECK-NEXT: cmp w8, #1 +; CHECK-NEXT: csinc 
w8, w8, wzr, lo +; CHECK-NEXT: str w8, [x10] +; CHECK-NEXT: ret + %buckets = getelementptr i32, ptr %base, <4 x i32> %indices + call void @llvm.experimental.vector.histogram.umin.nxv4p0.i32(<4 x ptr> %buckets, i32 1, <4 x i1> ) + ret void +} From 775ad3e49c83407b79dd5ad533204884cb8b23ce Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Wed, 11 Jun 2025 07:16:58 -0700 Subject: [PATCH 085/851] [flang][acc] Ensure all acc.loop get a default parallelism determination mode (#143623) This PR updates the flang lowering to explicitly implement the OpenACC rules: - As per OpenACC 3.3 standard section 2.9.6 independent clause: A loop construct with no auto or seq clause is treated as if it has the independent clause when it is an orphaned loop construct or its parent compute construct is a parallel construct. - As per OpenACC 3.3 standard section 2.9.7 auto clause: When the parent compute construct is a kernels construct, a loop construct with no independent or seq clause is treated as if it has the auto clause. - Loops in serial regions are `seq` if they have no other parallelism marking such as gang, worker, vector. For now the `acc.loop` verifier has not yet been updated to enforce this. 
--- flang/lib/Lower/OpenACC.cpp | 67 +++++++++++++++++++ flang/test/Lower/OpenACC/acc-kernels-loop.f90 | 28 ++++---- flang/test/Lower/OpenACC/acc-loop.f90 | 52 +++++++------- .../test/Lower/OpenACC/acc-parallel-loop.f90 | 28 ++++---- flang/test/Lower/OpenACC/acc-serial-loop.f90 | 28 ++++---- mlir/include/mlir/Dialect/OpenACC/OpenACC.h | 6 ++ mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 11 +++ 7 files changed, 152 insertions(+), 68 deletions(-) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index c10e1777614cd..69e9c53baa740 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -2150,6 +2150,70 @@ privatizeIv(Fortran::lower::AbstractConverter &converter, ivPrivate.push_back(privateValue); } +static void determineDefaultLoopParMode( + Fortran::lower::AbstractConverter &converter, mlir::acc::LoopOp &loopOp, + llvm::SmallVector &seqDeviceTypes, + llvm::SmallVector &independentDeviceTypes, + llvm::SmallVector &autoDeviceTypes) { + auto hasDeviceNone = [](mlir::Attribute attr) -> bool { + return mlir::dyn_cast(attr).getValue() == + mlir::acc::DeviceType::None; + }; + bool hasDefaultSeq = llvm::any_of(seqDeviceTypes, hasDeviceNone); + bool hasDefaultIndependent = + llvm::any_of(independentDeviceTypes, hasDeviceNone); + bool hasDefaultAuto = llvm::any_of(autoDeviceTypes, hasDeviceNone); + if (hasDefaultSeq || hasDefaultIndependent || hasDefaultAuto) + return; // Default loop par mode is already specified. 
+ + mlir::Region *currentRegion = + converter.getFirOpBuilder().getBlock()->getParent(); + mlir::Operation *parentOp = mlir::acc::getEnclosingComputeOp(*currentRegion); + const bool isOrphanedLoop = !parentOp; + if (isOrphanedLoop || + mlir::isa_and_present(parentOp)) { + // As per OpenACC 3.3 standard section 2.9.6 independent clause: + // A loop construct with no auto or seq clause is treated as if it has the + // independent clause when it is an orphaned loop construct or its parent + // compute construct is a parallel construct. + independentDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + converter.getFirOpBuilder().getContext(), mlir::acc::DeviceType::None)); + } else if (mlir::isa_and_present(parentOp)) { + // Serial construct implies `seq` clause on loop. However, this + // conflicts with parallelism assignment if already set. Therefore check + // that first. + bool hasDefaultGangWorkerOrVector = + loopOp.hasVector() || loopOp.getVectorValue() || loopOp.hasWorker() || + loopOp.getWorkerValue() || loopOp.hasGang() || + loopOp.getGangValue(mlir::acc::GangArgType::Num) || + loopOp.getGangValue(mlir::acc::GangArgType::Dim) || + loopOp.getGangValue(mlir::acc::GangArgType::Static); + if (!hasDefaultGangWorkerOrVector) + seqDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + converter.getFirOpBuilder().getContext(), + mlir::acc::DeviceType::None)); + // Since the loop has some parallelism assigned - we cannot assign `seq`. + // However, the `acc.loop` verifier will check that one of seq, independent, + // or auto is marked. 
Seems reasonable to mark as auto since the OpenACC + // spec does say "If not, or if it is unable to make a determination, it + // must treat the auto clause as if it is a seq clause, and it must + // ignore any gang, worker, or vector clauses on the loop construct" + else + autoDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + converter.getFirOpBuilder().getContext(), + mlir::acc::DeviceType::None)); + } else { + // As per OpenACC 3.3 standard section 2.9.7 auto clause: + // When the parent compute construct is a kernels construct, a loop + // construct with no independent or seq clause is treated as if it has the + // auto clause. + assert(mlir::isa_and_present(parentOp) && + "Expected kernels construct"); + autoDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + converter.getFirOpBuilder().getContext(), mlir::acc::DeviceType::None)); + } +} + static mlir::acc::LoopOp createLoopOp( Fortran::lower::AbstractConverter &converter, mlir::Location currentLocation, @@ -2482,6 +2546,9 @@ static mlir::acc::LoopOp createLoopOp( loopOp.setTileOperandsSegmentsAttr( builder.getDenseI32ArrayAttr(tileOperandsSegments)); + // Determine the loop's default par mode - either seq, independent, or auto. + determineDefaultLoopParMode(converter, loopOp, seqDeviceTypes, + independentDeviceTypes, autoDeviceTypes); if (!seqDeviceTypes.empty()) loopOp.setSeqAttr(builder.getArrayAttr(seqDeviceTypes)); if (!independentDeviceTypes.empty()) diff --git a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 index 8608b0ad98ce6..4e968144399a8 100644 --- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 @@ -47,7 +47,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: acc.loop private{{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]{{.*}}} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -59,7 +59,7 @@ subroutine acc_kernels_loop ! 
CHECK: acc.kernels combined(loop) { ! CHECK: acc.loop combined(kernels) private{{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]{{.*}}} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -490,7 +490,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels {{.*}} { ! CHECK: acc.loop {{.*}} gang {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -503,7 +503,7 @@ subroutine acc_kernels_loop ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 ! CHECK: acc.loop {{.*}} gang({num=[[GANGNUM1]] : i32}) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -516,7 +516,7 @@ subroutine acc_kernels_loop ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.loop {{.*}} gang({num=[[GANGNUM2]] : i32}) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -528,7 +528,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels {{.*}} { ! CHECK: acc.loop {{.*}} gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -540,7 +540,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels {{.*}} { ! CHECK: acc.loop {{.*}} vector {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -553,7 +553,7 @@ subroutine acc_kernels_loop ! 
CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32 ! CHECK: acc.loop {{.*}} vector([[CONSTANT128]] : i32) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -566,7 +566,7 @@ subroutine acc_kernels_loop ! CHECK: [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.loop {{.*}} vector([[VECTORLENGTH]] : i32) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -578,7 +578,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels {{.*}} { ! CHECK: acc.loop {{.*}} worker {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -591,7 +591,7 @@ subroutine acc_kernels_loop ! CHECK: [[WORKER128:%.*]] = arith.constant 128 : i32 ! CHECK: acc.loop {{.*}} worker([[WORKER128]] : i32) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -605,7 +605,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels {{.*}} { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {{{.*}}collapse = [2], collapseDeviceType = [#acc.device_type]{{.*}}} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -621,9 +621,9 @@ subroutine acc_kernels_loop ! CHECK: acc.loop {{.*}} { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]{{.*}}} ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]{{.*}}} ! 
CHECK: acc.terminator ! CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 index 0246f60705898..5baa485534b2a 100644 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -29,7 +29,7 @@ program acc_loop ! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]}{{$}} !$acc loop seq DO i = 1, n @@ -65,7 +65,7 @@ program acc_loop ! CHECK: acc.loop gang private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop gang(num: 8) DO i = 1, n @@ -75,7 +75,7 @@ program acc_loop ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 ! CHECK: acc.loop gang({num=[[GANGNUM1]] : i32}) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop gang(num: gangNum) DO i = 1, n @@ -85,7 +85,7 @@ program acc_loop ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.loop gang({num=[[GANGNUM2]] : i32}) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! 
CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop gang(num: gangNum, static: gangStatic) DO i = 1, n @@ -94,7 +94,7 @@ program acc_loop ! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop vector DO i = 1, n @@ -103,7 +103,7 @@ program acc_loop ! CHECK: acc.loop vector private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop vector(128) DO i = 1, n @@ -113,7 +113,7 @@ program acc_loop ! CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32 ! CHECK: acc.loop vector([[CONSTANT128]] : i32) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop vector(vectorLength) DO i = 1, n @@ -123,7 +123,7 @@ program acc_loop ! CHECK: [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.loop vector([[VECTORLENGTH]] : i32) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop worker DO i = 1, n @@ -132,7 +132,7 @@ program acc_loop ! 
CHECK: acc.loop worker private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop worker(128) DO i = 1, n @@ -142,7 +142,7 @@ program acc_loop ! CHECK: [[WORKER128:%.*]] = arith.constant 128 : i32 ! CHECK: acc.loop worker([[WORKER128]] : i32) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop private(c) DO i = 1, n @@ -151,7 +151,7 @@ program acc_loop ! CHECK: acc.loop private(@privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! When the induction variable is explicitly private - only a single private entry should be created. !$acc loop private(i) @@ -161,7 +161,7 @@ program acc_loop ! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop private(c, d) DO i = 1, n @@ -170,7 +170,7 @@ program acc_loop ! 
CHECK: acc.loop private(@privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop private(c) private(d) DO i = 1, n @@ -179,7 +179,7 @@ program acc_loop ! CHECK: acc.loop private(@privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop tile(2) DO i = 1, n @@ -189,7 +189,7 @@ program acc_loop ! CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 ! CHECK: acc.loop {{.*}} tile({[[TILESIZE]] : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop tile(*) DO i = 1, n @@ -198,7 +198,7 @@ program acc_loop ! CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 ! CHECK: acc.loop {{.*}} tile({[[TILESIZEM1]] : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop tile(2, 2) DO i = 1, n @@ -211,7 +211,7 @@ program acc_loop ! CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 ! 
CHECK: acc.loop {{.*}} tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop tile(tileSize) DO i = 1, n @@ -220,7 +220,7 @@ program acc_loop ! CHECK: acc.loop {{.*}} tile({%{{.*}} : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop tile(tileSize, tileSize) DO i = 1, n @@ -231,7 +231,7 @@ program acc_loop ! CHECK: acc.loop {{.*}} tile({%{{.*}} : i32, %{{.*}} : i32}) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop collapse(2) DO i = 1, n @@ -244,7 +244,7 @@ program acc_loop ! CHECK: fir.store %arg0 to %{{.*}} : !fir.ref ! CHECK: fir.store %arg1 to %{{.*}} : !fir.ref ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]{{.*}}} !$acc loop DO i = 1, n @@ -257,9 +257,9 @@ program acc_loop ! CHECK: acc.loop {{.*}} control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.loop {{.*}} control(%arg1 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield -! 
CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop reduction(+:reduction_r) reduction(*:reduction_i) do i = 1, n @@ -269,7 +269,7 @@ program acc_loop ! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop gang(dim: gangDim, static: gangStatic) DO i = 1, n @@ -278,7 +278,7 @@ program acc_loop ! CHECK: acc.loop gang({dim=%{{.*}}, static=%{{.*}} : i32}) {{.*}} control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop gang(dim: 1) DO i = 1, n @@ -287,7 +287,7 @@ program acc_loop ! CHECK: acc.loop gang({dim={{.*}} : i32}) {{.*}} control(%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop DO i = 1, n @@ -335,4 +335,4 @@ subroutine sub1(i, j, k) ! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]] : !fir.ref) -> !fir.ref {implicit = true, name = "j"} ! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]] : !fir.ref) -> !fir.ref {implicit = true, name = "k"} ! 
CHECK: acc.loop combined(parallel) private(@privatization_ref_i32 -> %[[P_I]] : !fir.ref, @privatization_ref_i32 -> %[[P_J]] : !fir.ref, @privatization_ref_i32 -> %[[P_K]] : !fir.ref) control(%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%c10{{.*}}, %c100{{.*}}, %c200{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) -! CHECK: } attributes {inclusiveUpperbound = array} +! CHECK: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 index 4cf268d2517f5..32060179acdf1 100644 --- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 @@ -49,7 +49,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: acc.loop private{{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {{{.*}}independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -61,7 +61,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel combined(loop) { ! CHECK: acc.loop combined(parallel) private{{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {{{.*}}independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -505,7 +505,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel {{.*}} { ! CHECK: acc.loop {{.*}} gang ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -518,7 +518,7 @@ subroutine acc_parallel_loop ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 ! CHECK: acc.loop {{.*}} gang({num=[[GANGNUM1]] : i32}) ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! 
CHECK-NEXT: }{{$}} @@ -531,7 +531,7 @@ subroutine acc_parallel_loop ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.loop {{.*}} gang({num=[[GANGNUM2]] : i32}) ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -543,7 +543,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel {{.*}} { ! CHECK: acc.loop {{.*}} gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -555,7 +555,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel {{.*}} { ! CHECK: acc.loop {{.*}} vector ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -568,7 +568,7 @@ subroutine acc_parallel_loop ! CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32 ! CHECK: acc.loop {{.*}} vector([[CONSTANT128]] : i32) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -582,7 +582,7 @@ subroutine acc_parallel_loop ! CHECK: acc.loop {{.*}} vector([[VECTORLENGTH]] : i32) {{.*}} { ! CHECK-NOT: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -595,7 +595,7 @@ subroutine acc_parallel_loop ! CHECK: acc.loop {{.*}} worker {{.*}} { ! CHECK-NOT: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! 
CHECK-NEXT: }{{$}} @@ -609,7 +609,7 @@ subroutine acc_parallel_loop ! CHECK: acc.loop {{.*}} worker([[WORKER128]] : i32) {{.*}} { ! CHECK-NOT: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -623,7 +623,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel {{.*}} { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {{{.*}}collapse = [2], collapseDeviceType = [#acc.device_type]{{.*}}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -639,9 +639,9 @@ subroutine acc_parallel_loop ! CHECK: acc.loop {{.*}} { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {{{.*}}independent = [#acc.device_type]} ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {{{.*}}independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index 34391f78ae707..af7bb0fac158c 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -68,7 +68,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: acc.loop private{{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {{{.*}}seq = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -80,7 +80,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial combined(loop) { ! CHECK: acc.loop combined(serial) private{{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {{{.*}}seq = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -446,7 +446,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial {{.*}} { ! CHECK: acc.loop {{.*}} gang {{.*}} { ! CHECK: acc.yield -! 
CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -459,7 +459,7 @@ subroutine acc_serial_loop ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 ! CHECK: acc.loop {{.*}} gang({num=[[GANGNUM1]] : i32}) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -472,7 +472,7 @@ subroutine acc_serial_loop ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.loop {{.*}} gang({num=[[GANGNUM2]] : i32}) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -484,7 +484,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial {{.*}} { ! CHECK: acc.loop {{.*}} gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -496,7 +496,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial {{.*}} { ! CHECK: acc.loop {{.*}} vector {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -509,7 +509,7 @@ subroutine acc_serial_loop ! CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32 ! CHECK: acc.loop {{.*}} vector([[CONSTANT128]] : i32) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -522,7 +522,7 @@ subroutine acc_serial_loop ! CHECK: [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref ! 
CHECK: acc.loop {{.*}} vector([[VECTORLENGTH]] : i32) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -534,7 +534,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial {{.*}} { ! CHECK: acc.loop {{.*}} worker {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -547,7 +547,7 @@ subroutine acc_serial_loop ! CHECK: [[WORKER128:%.*]] = arith.constant 128 : i32 ! CHECK: acc.loop {{.*}} worker([[WORKER128]] : i32) {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -562,7 +562,7 @@ subroutine acc_serial_loop ! CHECK: acc.loop {{.*}} { ! CHECK-NOT: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} +! CHECK-NEXT: } attributes {{{.*}}collapse = [2], collapseDeviceType = [#acc.device_type]{{.*}}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -578,9 +578,9 @@ subroutine acc_serial_loop ! CHECK: acc.loop {{.*}} { ! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {{{.*}}seq = [#acc.device_type]} ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {{{.*}}seq = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h index ff5845343313c..4eb666239d4e4 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h @@ -151,6 +151,12 @@ mlir::ValueRange getDataOperands(mlir::Operation *accOp); /// Used to get a mutable range iterating over the data operands. 
mlir::MutableOperandRange getMutableDataOperands(mlir::Operation *accOp); +/// Used to obtain the enclosing compute construct operation that contains +/// the provided `region`. Returns nullptr if no compute construct operation +/// is found. The returned operation is one of the types defined by +///`ACC_COMPUTE_CONSTRUCT_OPS`. +mlir::Operation *getEnclosingComputeOp(mlir::Region &region); + /// Used to check whether the provided `type` implements the `PointerLikeType` /// interface. inline bool isPointerLikeType(mlir::Type type) { diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 658ad28477ace..c72ec47be9f04 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -3820,3 +3820,14 @@ mlir::acc::getMutableDataOperands(mlir::Operation *accOp) { .Default([&](mlir::Operation *) { return nullptr; })}; return dataOperands; } + +mlir::Operation *mlir::acc::getEnclosingComputeOp(mlir::Region &region) { + mlir::Operation *parentOp = region.getParentOp(); + while (parentOp) { + if (mlir::isa<ACC_COMPUTE_CONSTRUCT_OPS>(parentOp)) { + return parentOp; + } + parentOp = parentOp->getParentOp(); + } + return nullptr; +} From b3db0c6a1d063ec9ee15253bde3d428c0ad5968b Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Wed, 11 Jun 2025 10:30:38 -0400 Subject: [PATCH 086/851] [HLSL][Driver] Make vk1.3 the default. (#143384) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The HLSL driver currently defaults the triple to an unversioned os and subarch when targeting SPIR-V. This means the SPIR-V backend decides the default value. That is not a great option because a change to the backend could cause a change in Clang. Now that we want to choose the default we need to consider the best option. DXC currently defaults to Vulkan1.0.
We are planning on not supporting Vulkan1.0 in the Clang HLSL compiler because newer versions of Vulkan are commonly supported on nearly all hardware, so users do not use it. Since we have to change from DXC anyway, we are using VK1.3. It has been out long enough to be commonly available, and the initial implementation of SPIR-V features for HLSL is assuming Vulkan 1.3. --------- Co-authored-by: Nathan Gauër --- clang/lib/Driver/Driver.cpp | 27 +++++++++++++-------------- clang/test/Driver/dxc_spirv.hlsl | 2 +- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 80728daca03c9..eb60d907d2218 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1596,28 +1596,27 @@ Compilation *Driver::BuildCompilation(ArrayRef ArgList) { A->claim(); if (Args.hasArg(options::OPT_spirv)) { + const llvm::StringMap ValidTargets = { + {"vulkan1.2", llvm::Triple::SPIRVSubArch_v15}, + {"vulkan1.3", llvm::Triple::SPIRVSubArch_v16}}; llvm::Triple T(TargetTriple); - T.setArch(llvm::Triple::spirv); - T.setOS(llvm::Triple::Vulkan); - // Set specific Vulkan version if applicable. + // Set specific Vulkan version. Default to vulkan1.3.
+ auto TargetInfo = ValidTargets.find("vulkan1.3"); + assert(TargetInfo != ValidTargets.end()); if (const Arg *A = Args.getLastArg(options::OPT_fspv_target_env_EQ)) { - const llvm::StringMap ValidTargets = { - {"vulkan1.2", llvm::Triple::SPIRVSubArch_v15}, - {"vulkan1.3", llvm::Triple::SPIRVSubArch_v16}}; - - auto TargetInfo = ValidTargets.find(A->getValue()); - if (TargetInfo != ValidTargets.end()) { - T.setOSName(TargetInfo->getKey()); - T.setArch(llvm::Triple::spirv, TargetInfo->getValue()); - } else { + TargetInfo = ValidTargets.find(A->getValue()); + if (TargetInfo == ValidTargets.end()) { Diag(diag::err_drv_invalid_value) << A->getAsString(Args) << A->getValue(); } A->claim(); } - - TargetTriple = T.str(); + if (TargetInfo != ValidTargets.end()) { + T.setOSName(TargetInfo->getKey()); + T.setArch(llvm::Triple::spirv, TargetInfo->getValue()); + TargetTriple = T.str(); + } } } else { Diag(diag::err_drv_dxc_missing_target_profile); diff --git a/clang/test/Driver/dxc_spirv.hlsl b/clang/test/Driver/dxc_spirv.hlsl index e6624e5f1b3f6..65c9018dc54c5 100644 --- a/clang/test/Driver/dxc_spirv.hlsl +++ b/clang/test/Driver/dxc_spirv.hlsl @@ -3,7 +3,7 @@ // RUN: %clang_dxc -T cs_6_0 -spirv -fspv-target-env=vulkan1.3 -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-VULKAN13 // RUN: not %clang_dxc -T cs_6_0 -spirv -fspv-target-env=vulkan1.0 -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR -// CHECK: "-triple" "spirv-unknown-vulkan-compute" +// CHECK: "-triple" "spirv1.6-unknown-vulkan1.3-compute" // CHECK-SAME: "-x" "hlsl" // CHECK-VULKAN12: "-triple" "spirv1.5-unknown-vulkan1.2-compute" From 4e441665cc0d1585c8c6e44cf3c71a055f597d2e Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 11 Jun 2025 14:47:47 +0200 Subject: [PATCH 087/851] [BasicAA][ValueTracking] Use MaxLookupSearchDepth constant (NFC) Use MaxLookupSearchDepth in all places limiting an underlying object walk, instead of hardcoding 6 in various places. 
--- llvm/include/llvm/Analysis/ValueTracking.h | 13 +++++++++---- llvm/lib/Analysis/BasicAliasAnalysis.cpp | 4 ---- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index b05b8f349b8d5..32ab9733d13c9 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -45,6 +45,10 @@ template class ArrayRef; constexpr unsigned MaxAnalysisRecursionDepth = 6; +/// The max limit of the search depth in DecomposeGEPExpression() and +/// getUnderlyingObject(). +constexpr unsigned MaxLookupSearchDepth = 6; + /// Determine which bits of V are known to be either zero or one and return /// them in the KnownZero/KnownOne bit sets. /// @@ -432,9 +436,10 @@ LLVM_ABI bool isIntrinsicReturningPointerAliasingArgumentWithoutCapturing( /// original object being addressed. Note that the returned value has pointer /// type if the specified value does. If the \p MaxLookup value is non-zero, it /// limits the number of instructions to be stripped off. -LLVM_ABI const Value *getUnderlyingObject(const Value *V, - unsigned MaxLookup = 6); -inline Value *getUnderlyingObject(Value *V, unsigned MaxLookup = 6) { +LLVM_ABI const Value * +getUnderlyingObject(const Value *V, unsigned MaxLookup = MaxLookupSearchDepth); +inline Value *getUnderlyingObject(Value *V, + unsigned MaxLookup = MaxLookupSearchDepth) { // Force const to avoid infinite recursion. const Value *VConst = V; return const_cast(getUnderlyingObject(VConst, MaxLookup)); @@ -475,7 +480,7 @@ LLVM_ABI const Value *getUnderlyingObjectAggressive(const Value *V); LLVM_ABI void getUnderlyingObjects(const Value *V, SmallVectorImpl &Objects, const LoopInfo *LI = nullptr, - unsigned MaxLookup = 6); + unsigned MaxLookup = MaxLookupSearchDepth); /// This is a wrapper around getUnderlyingObjects and adds support for basic /// ptrtoint+arithmetic+inttoptr sequences. 
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp index e6675256fd5a0..f862d6930f545 100644 --- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp +++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp @@ -79,10 +79,6 @@ STATISTIC(SearchLimitReached, "Number of times the limit to " "decompose GEPs is reached"); STATISTIC(SearchTimes, "Number of times a GEP is decomposed"); -// The max limit of the search depth in DecomposeGEPExpression() and -// getUnderlyingObject(). -static const unsigned MaxLookupSearchDepth = 6; - bool BasicAAResult::invalidate(Function &Fn, const PreservedAnalyses &PA, FunctionAnalysisManager::Invalidator &Inv) { // We don't care if this analysis itself is preserved, it has no state. But From 10f512f7bbda076ca2a0f9e3fcb2e7be0cb07199 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 11 Jun 2025 07:55:06 -0700 Subject: [PATCH 088/851] Revert runtime work queue patch, it breaks some tests that need investigation (#143713) Revert "[flang][runtime] Another try to fix build failure" This reverts commit 13869cac2b5051e453aa96ad71220d9d33404620. Revert "[flang][runtime] Fix build bot flang-runtime-cuda-gcc errors (#143650)" This reverts commit d75e28477af0baa063a4d4cc7b3cf657cfadd758. Revert "[flang][runtime] Replace recursion with iterative work queue (#137727)" This reverts commit 163c67ad3d1bf7af6590930d8f18700d65ad4564. 
--- .../include/flang-rt/runtime/environment.h | 3 - flang-rt/include/flang-rt/runtime/stat.h | 10 +- flang-rt/include/flang-rt/runtime/type-info.h | 2 - .../include/flang-rt/runtime/work-queue.h | 552 --------------- flang-rt/lib/runtime/CMakeLists.txt | 2 - flang-rt/lib/runtime/assign.cpp | 623 ++++++----------- flang-rt/lib/runtime/derived.cpp | 517 +++++++------- flang-rt/lib/runtime/descriptor-io.cpp | 651 +----------------- flang-rt/lib/runtime/descriptor-io.h | 620 ++++++++++++++++- flang-rt/lib/runtime/environment.cpp | 4 - flang-rt/lib/runtime/namelist.cpp | 1 - flang-rt/lib/runtime/tools.cpp | 4 +- flang-rt/lib/runtime/type-info.cpp | 6 +- flang-rt/lib/runtime/work-queue.cpp | 161 ----- flang-rt/unittests/Runtime/ExternalIOTest.cpp | 2 +- flang/docs/Extensions.md | 10 - flang/include/flang/Runtime/assign.h | 2 +- flang/include/flang/Semantics/tools.h | 7 +- flang/lib/Semantics/runtime-type-info.cpp | 4 - flang/lib/Semantics/tools.cpp | 32 - flang/module/__fortran_type_info.f90 | 3 +- flang/test/Lower/volatile-openmp.f90 | 8 +- flang/test/Semantics/typeinfo01.f90 | 30 +- flang/test/Semantics/typeinfo03.f90 | 2 +- flang/test/Semantics/typeinfo04.f90 | 8 +- flang/test/Semantics/typeinfo05.f90 | 4 +- flang/test/Semantics/typeinfo06.f90 | 4 +- flang/test/Semantics/typeinfo07.f90 | 8 +- flang/test/Semantics/typeinfo08.f90 | 2 +- flang/test/Semantics/typeinfo11.f90 | 2 +- flang/test/Semantics/typeinfo12.f90 | 67 -- 31 files changed, 1120 insertions(+), 2231 deletions(-) delete mode 100644 flang-rt/include/flang-rt/runtime/work-queue.h delete mode 100644 flang-rt/lib/runtime/work-queue.cpp delete mode 100644 flang/test/Semantics/typeinfo12.f90 diff --git a/flang-rt/include/flang-rt/runtime/environment.h b/flang-rt/include/flang-rt/runtime/environment.h index e579f6012ce86..16258b3bbba9b 100644 --- a/flang-rt/include/flang-rt/runtime/environment.h +++ b/flang-rt/include/flang-rt/runtime/environment.h @@ -64,9 +64,6 @@ struct ExecutionEnvironment { bool 
defaultUTF8{false}; // DEFAULT_UTF8 bool checkPointerDeallocation{true}; // FORT_CHECK_POINTER_DEALLOCATION - enum InternalDebugging { WorkQueue = 1 }; - int internalDebugging{0}; // FLANG_RT_DEBUG - // CUDA related variables std::size_t cudaStackLimit{0}; // ACC_OFFLOAD_STACK_SIZE bool cudaDeviceIsManaged{false}; // NV_CUDAFOR_DEVICE_IS_MANAGED diff --git a/flang-rt/include/flang-rt/runtime/stat.h b/flang-rt/include/flang-rt/runtime/stat.h index dc372de53506a..070d0bf8673fb 100644 --- a/flang-rt/include/flang-rt/runtime/stat.h +++ b/flang-rt/include/flang-rt/runtime/stat.h @@ -24,7 +24,7 @@ class Terminator; enum Stat { StatOk = 0, // required to be zero by Fortran - // Interoperable STAT= codes (>= 11) + // Interoperable STAT= codes StatBaseNull = CFI_ERROR_BASE_ADDR_NULL, StatBaseNotNull = CFI_ERROR_BASE_ADDR_NOT_NULL, StatInvalidElemLen = CFI_INVALID_ELEM_LEN, @@ -36,7 +36,7 @@ enum Stat { StatMemAllocation = CFI_ERROR_MEM_ALLOCATION, StatOutOfBounds = CFI_ERROR_OUT_OF_BOUNDS, - // Standard STAT= values (>= 101) + // Standard STAT= values StatFailedImage = FORTRAN_RUNTIME_STAT_FAILED_IMAGE, StatLocked = FORTRAN_RUNTIME_STAT_LOCKED, StatLockedOtherImage = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE, @@ -49,14 +49,10 @@ enum Stat { // Additional "processor-defined" STAT= values StatInvalidArgumentNumber = FORTRAN_RUNTIME_STAT_INVALID_ARG_NUMBER, StatMissingArgument = FORTRAN_RUNTIME_STAT_MISSING_ARG, - StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, // -1 + StatValueTooShort = FORTRAN_RUNTIME_STAT_VALUE_TOO_SHORT, StatMoveAllocSameAllocatable = FORTRAN_RUNTIME_STAT_MOVE_ALLOC_SAME_ALLOCATABLE, StatBadPointerDeallocation = FORTRAN_RUNTIME_STAT_BAD_POINTER_DEALLOCATION, - - // Dummy status for work queue continuation, declared here to perhaps - // avoid collisions - StatContinue = 201 }; RT_API_ATTRS const char *StatErrorString(int); diff --git a/flang-rt/include/flang-rt/runtime/type-info.h b/flang-rt/include/flang-rt/runtime/type-info.h index 
9bde3adba87f5..5e79efde164f2 100644 --- a/flang-rt/include/flang-rt/runtime/type-info.h +++ b/flang-rt/include/flang-rt/runtime/type-info.h @@ -240,7 +240,6 @@ class DerivedType { RT_API_ATTRS bool noFinalizationNeeded() const { return noFinalizationNeeded_; } - RT_API_ATTRS bool noDefinedAssignment() const { return noDefinedAssignment_; } RT_API_ATTRS std::size_t LenParameters() const { return lenParameterKind().Elements(); @@ -323,7 +322,6 @@ class DerivedType { bool noInitializationNeeded_{false}; bool noDestructionNeeded_{false}; bool noFinalizationNeeded_{false}; - bool noDefinedAssignment_{false}; }; } // namespace Fortran::runtime::typeInfo diff --git a/flang-rt/include/flang-rt/runtime/work-queue.h b/flang-rt/include/flang-rt/runtime/work-queue.h deleted file mode 100644 index f8cc820c06ca1..0000000000000 --- a/flang-rt/include/flang-rt/runtime/work-queue.h +++ /dev/null @@ -1,552 +0,0 @@ -//===-- include/flang-rt/runtime/work-queue.h -------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Internal runtime utilities for work queues that replace the use of recursion -// for better GPU device support. -// -// A work queue comprises a list of tickets. Each ticket class has a Begin() -// member function, which is called once, and a Continue() member function -// that can be called zero or more times. A ticket's execution terminates -// when either of these member functions returns a status other than -// StatContinue. When that status is not StatOk, then the whole queue -// is shut down. -// -// By returning StatContinue from its Continue() member function, -// a ticket suspends its execution so that any nested tickets that it -// may have created can be run to completion. 
It is the reponsibility -// of each ticket class to maintain resumption information in its state -// and manage its own progress. Most ticket classes inherit from -// class ComponentsOverElements, which implements an outer loop over all -// components of a derived type, and an inner loop over all elements -// of a descriptor, possibly with multiple phases of execution per element. -// -// Tickets are created by WorkQueue::Begin...() member functions. -// There is one of these for each "top level" recursive function in the -// Fortran runtime support library that has been restructured into this -// ticket framework. -// -// When the work queue is running tickets, it always selects the last ticket -// on the list for execution -- "work stack" might have been a more accurate -// name for this framework. This ticket may, while doing its job, create -// new tickets, and since those are pushed after the active one, the first -// such nested ticket will be the next one executed to completion -- i.e., -// the order of nested WorkQueue::Begin...() calls is respected. -// Note that a ticket's Continue() member function won't be called again -// until all nested tickets have run to completion and it is once again -// the last ticket on the queue. -// -// Example for an assignment to a derived type: -// 1. Assign() is called, and its work queue is created. It calls -// WorkQueue::BeginAssign() and then WorkQueue::Run(). -// 2. Run calls AssignTicket::Begin(), which pushes a tickets via -// BeginFinalize() and returns StatContinue. -// 3. FinalizeTicket::Begin() and FinalizeTicket::Continue() are called -// until one of them returns StatOk, which ends the finalization ticket. -// 4. AssignTicket::Continue() is then called; it creates a DerivedAssignTicket -// and then returns StatOk, which ends the ticket. -// 5. At this point, only one ticket remains. DerivedAssignTicket::Begin() -// and ::Continue() are called until they are done (not StatContinue). 
-// Along the way, it may create nested AssignTickets for components, -// and suspend itself so that they may each run to completion. - -#ifndef FLANG_RT_RUNTIME_WORK_QUEUE_H_ -#define FLANG_RT_RUNTIME_WORK_QUEUE_H_ - -#include "flang-rt/runtime/connection.h" -#include "flang-rt/runtime/descriptor.h" -#include "flang-rt/runtime/stat.h" -#include "flang-rt/runtime/type-info.h" -#include "flang/Common/api-attrs.h" -#include "flang/Runtime/freestanding-tools.h" -#include - -namespace Fortran::runtime::io { -class IoStatementState; -struct NonTbpDefinedIoTable; -} // namespace Fortran::runtime::io - -namespace Fortran::runtime { -class Terminator; -class WorkQueue; - -// Ticket worker base classes - -template class ImmediateTicketRunner { -public: - RT_API_ATTRS explicit ImmediateTicketRunner(TICKET &ticket) - : ticket_{ticket} {} - RT_API_ATTRS int Run(WorkQueue &workQueue) { - int status{ticket_.Begin(workQueue)}; - while (status == StatContinue) { - status = ticket_.Continue(workQueue); - } - return status; - } - -private: - TICKET &ticket_; -}; - -// Base class for ticket workers that operate elementwise over descriptors -class Elementwise { -public: - RT_API_ATTRS Elementwise( - const Descriptor &instance, const Descriptor *from = nullptr) - : instance_{instance}, from_{from} { - instance_.GetLowerBounds(subscripts_); - if (from_) { - from_->GetLowerBounds(fromSubscripts_); - } - } - RT_API_ATTRS bool IsComplete() const { return elementAt_ >= elements_; } - RT_API_ATTRS void Advance() { - ++elementAt_; - instance_.IncrementSubscripts(subscripts_); - if (from_) { - from_->IncrementSubscripts(fromSubscripts_); - } - } - RT_API_ATTRS void SkipToEnd() { elementAt_ = elements_; } - RT_API_ATTRS void Reset() { - elementAt_ = 0; - instance_.GetLowerBounds(subscripts_); - if (from_) { - from_->GetLowerBounds(fromSubscripts_); - } - } - -protected: - const Descriptor &instance_, *from_{nullptr}; - std::size_t elements_{instance_.Elements()}; - std::size_t elementAt_{0}; - 
SubscriptValue subscripts_[common::maxRank]; - SubscriptValue fromSubscripts_[common::maxRank]; -}; - -// Base class for ticket workers that operate over derived type components. -class Componentwise { -public: - RT_API_ATTRS Componentwise(const typeInfo::DerivedType &); - RT_API_ATTRS bool IsComplete() const { return componentAt_ >= components_; } - RT_API_ATTRS void Advance() { - ++componentAt_; - GetComponent(); - } - RT_API_ATTRS void SkipToEnd() { - component_ = nullptr; - componentAt_ = components_; - } - RT_API_ATTRS void Reset() { - component_ = nullptr; - componentAt_ = 0; - GetComponent(); - } - RT_API_ATTRS void GetComponent(); - -protected: - const typeInfo::DerivedType &derived_; - std::size_t components_{0}, componentAt_{0}; - const typeInfo::Component *component_{nullptr}; - StaticDescriptor componentDescriptor_; -}; - -// Base class for ticket workers that operate over derived type components -// in an outer loop, and elements in an inner loop. -class ComponentsOverElements : public Componentwise, public Elementwise { -public: - RT_API_ATTRS ComponentsOverElements(const Descriptor &instance, - const typeInfo::DerivedType &derived, const Descriptor *from = nullptr) - : Componentwise{derived}, Elementwise{instance, from} { - if (Elementwise::IsComplete()) { - Componentwise::SkipToEnd(); - } - } - RT_API_ATTRS bool IsComplete() const { return Componentwise::IsComplete(); } - RT_API_ATTRS void Advance() { - SkipToNextElement(); - if (Elementwise::IsComplete()) { - Elementwise::Reset(); - Componentwise::Advance(); - } - } - RT_API_ATTRS void SkipToNextElement() { - phase_ = 0; - Elementwise::Advance(); - } - RT_API_ATTRS void SkipToNextComponent() { - phase_ = 0; - Elementwise::Reset(); - Componentwise::Advance(); - } - RT_API_ATTRS void Reset() { - phase_ = 0; - Elementwise::Reset(); - Componentwise::Reset(); - } - -protected: - int phase_{0}; -}; - -// Base class for ticket workers that operate over elements in an outer loop, -// type components in an 
inner loop. -class ElementsOverComponents : public Elementwise, public Componentwise { -public: - RT_API_ATTRS ElementsOverComponents(const Descriptor &instance, - const typeInfo::DerivedType &derived, const Descriptor *from = nullptr) - : Elementwise{instance, from}, Componentwise{derived} { - if (Componentwise::IsComplete()) { - Elementwise::SkipToEnd(); - } - } - RT_API_ATTRS bool IsComplete() const { return Elementwise::IsComplete(); } - RT_API_ATTRS void Advance() { - SkipToNextComponent(); - if (Componentwise::IsComplete()) { - Componentwise::Reset(); - Elementwise::Advance(); - } - } - RT_API_ATTRS void SkipToNextComponent() { - phase_ = 0; - Componentwise::Advance(); - } - RT_API_ATTRS void SkipToNextElement() { - phase_ = 0; - Componentwise::Reset(); - Elementwise::Advance(); - } - -protected: - int phase_{0}; -}; - -// Ticket worker classes - -// Implements derived type instance initialization -class InitializeTicket : public ImmediateTicketRunner, - private ComponentsOverElements { -public: - RT_API_ATTRS InitializeTicket( - const Descriptor &instance, const typeInfo::DerivedType &derived) - : ImmediateTicketRunner{*this}, - ComponentsOverElements{instance, derived} {} - RT_API_ATTRS int Begin(WorkQueue &); - RT_API_ATTRS int Continue(WorkQueue &); -}; - -// Initializes one derived type instance from the value of another -class InitializeCloneTicket - : public ImmediateTicketRunner, - private ComponentsOverElements { -public: - RT_API_ATTRS InitializeCloneTicket(const Descriptor &clone, - const Descriptor &original, const typeInfo::DerivedType &derived, - bool hasStat, const Descriptor *errMsg) - : ImmediateTicketRunner{*this}, - ComponentsOverElements{original, derived}, clone_{clone}, - hasStat_{hasStat}, errMsg_{errMsg} {} - RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; } - RT_API_ATTRS int Continue(WorkQueue &); - -private: - const Descriptor &clone_; - bool hasStat_{false}; - const Descriptor *errMsg_{nullptr}; - StaticDescriptor 
cloneComponentDescriptor_; -}; - -// Implements derived type instance finalization -class FinalizeTicket : public ImmediateTicketRunner, - private ComponentsOverElements { -public: - RT_API_ATTRS FinalizeTicket( - const Descriptor &instance, const typeInfo::DerivedType &derived) - : ImmediateTicketRunner{*this}, - ComponentsOverElements{instance, derived} {} - RT_API_ATTRS int Begin(WorkQueue &); - RT_API_ATTRS int Continue(WorkQueue &); - -private: - const typeInfo::DerivedType *finalizableParentType_{nullptr}; -}; - -// Implements derived type instance destruction -class DestroyTicket : public ImmediateTicketRunner, - private ComponentsOverElements { -public: - RT_API_ATTRS DestroyTicket(const Descriptor &instance, - const typeInfo::DerivedType &derived, bool finalize) - : ImmediateTicketRunner{*this}, - ComponentsOverElements{instance, derived}, finalize_{finalize} {} - RT_API_ATTRS int Begin(WorkQueue &); - RT_API_ATTRS int Continue(WorkQueue &); - -private: - bool finalize_{false}; -}; - -// Implements general intrinsic assignment -class AssignTicket : public ImmediateTicketRunner { -public: - RT_API_ATTRS AssignTicket( - Descriptor &to, const Descriptor &from, int flags, MemmoveFct memmoveFct) - : ImmediateTicketRunner{*this}, to_{to}, from_{&from}, - flags_{flags}, memmoveFct_{memmoveFct} {} - RT_API_ATTRS int Begin(WorkQueue &); - RT_API_ATTRS int Continue(WorkQueue &); - -private: - RT_API_ATTRS bool IsSimpleMemmove() const { - return !toDerived_ && to_.rank() == from_->rank() && to_.IsContiguous() && - from_->IsContiguous() && to_.ElementBytes() == from_->ElementBytes(); - } - RT_API_ATTRS Descriptor &GetTempDescriptor(); - - Descriptor &to_; - const Descriptor *from_{nullptr}; - int flags_{0}; // enum AssignFlags - MemmoveFct memmoveFct_{nullptr}; - StaticDescriptor tempDescriptor_; - const typeInfo::DerivedType *toDerived_{nullptr}; - Descriptor *toDeallocate_{nullptr}; - bool persist_{false}; - bool done_{false}; -}; - -// Implements derived type 
intrinsic assignment. -template -class DerivedAssignTicket - : public ImmediateTicketRunner>, - private std::conditional_t { -public: - using Base = std::conditional_t; - RT_API_ATTRS DerivedAssignTicket(const Descriptor &to, const Descriptor &from, - const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct, - Descriptor *deallocateAfter) - : ImmediateTicketRunner{*this}, - Base{to, derived, &from}, flags_{flags}, memmoveFct_{memmoveFct}, - deallocateAfter_{deallocateAfter} {} - RT_API_ATTRS int Begin(WorkQueue &); - RT_API_ATTRS int Continue(WorkQueue &); - -private: - static constexpr bool isComponentwise_{IS_COMPONENTWISE}; - bool toIsContiguous_{this->instance_.IsContiguous()}; - bool fromIsContiguous_{this->from_->IsContiguous()}; - int flags_{0}; - MemmoveFct memmoveFct_{nullptr}; - Descriptor *deallocateAfter_{nullptr}; - StaticDescriptor fromComponentDescriptor_; -}; - -namespace io::descr { - -template -class DescriptorIoTicket - : public ImmediateTicketRunner>, - private Elementwise { -public: - RT_API_ATTRS DescriptorIoTicket(io::IoStatementState &io, - const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table, - bool &anyIoTookPlace) - : ImmediateTicketRunner(*this), - Elementwise{descriptor}, io_{io}, table_{table}, - anyIoTookPlace_{anyIoTookPlace} {} - RT_API_ATTRS int Begin(WorkQueue &); - RT_API_ATTRS int Continue(WorkQueue &); - RT_API_ATTRS bool &anyIoTookPlace() { return anyIoTookPlace_; } - -private: - io::IoStatementState &io_; - const io::NonTbpDefinedIoTable *table_{nullptr}; - bool &anyIoTookPlace_; - common::optional nonTbpSpecial_; - const typeInfo::DerivedType *derived_{nullptr}; - const typeInfo::SpecialBinding *special_{nullptr}; - StaticDescriptor elementDescriptor_; -}; - -template -class DerivedIoTicket : public ImmediateTicketRunner>, - private ElementsOverComponents { -public: - RT_API_ATTRS DerivedIoTicket(io::IoStatementState &io, - const Descriptor &descriptor, const typeInfo::DerivedType &derived, - 
const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) - : ImmediateTicketRunner(*this), - ElementsOverComponents{descriptor, derived}, io_{io}, table_{table}, - anyIoTookPlace_{anyIoTookPlace} {} - RT_API_ATTRS int Begin(WorkQueue &) { return StatContinue; } - RT_API_ATTRS int Continue(WorkQueue &); - -private: - io::IoStatementState &io_; - const io::NonTbpDefinedIoTable *table_{nullptr}; - bool &anyIoTookPlace_; -}; - -} // namespace io::descr - -struct NullTicket { - RT_API_ATTRS int Begin(WorkQueue &) const { return StatOk; } - RT_API_ATTRS int Continue(WorkQueue &) const { return StatOk; } -}; - -struct Ticket { - RT_API_ATTRS int Continue(WorkQueue &); - bool begun{false}; - std::variant, - DerivedAssignTicket, - io::descr::DescriptorIoTicket, - io::descr::DescriptorIoTicket, - io::descr::DerivedIoTicket, - io::descr::DerivedIoTicket> - u; -}; - -class WorkQueue { -public: - RT_API_ATTRS explicit WorkQueue(Terminator &terminator) - : terminator_{terminator} { - for (int j{1}; j < numStatic_; ++j) { - static_[j].previous = &static_[j - 1]; - static_[j - 1].next = &static_[j]; - } - } - RT_API_ATTRS ~WorkQueue(); - RT_API_ATTRS Terminator &terminator() { return terminator_; }; - - // APIs for particular tasks. These can return StatOk if the work is - // completed immediately. 
- RT_API_ATTRS int BeginInitialize( - const Descriptor &descriptor, const typeInfo::DerivedType &derived) { - if (runTicketsImmediately_) { - return InitializeTicket{descriptor, derived}.Run(*this); - } else { - StartTicket().u.emplace(descriptor, derived); - return StatContinue; - } - } - RT_API_ATTRS int BeginInitializeClone(const Descriptor &clone, - const Descriptor &original, const typeInfo::DerivedType &derived, - bool hasStat, const Descriptor *errMsg) { - if (runTicketsImmediately_) { - return InitializeCloneTicket{clone, original, derived, hasStat, errMsg} - .Run(*this); - } else { - StartTicket().u.emplace( - clone, original, derived, hasStat, errMsg); - return StatContinue; - } - } - RT_API_ATTRS int BeginFinalize( - const Descriptor &descriptor, const typeInfo::DerivedType &derived) { - if (runTicketsImmediately_) { - return FinalizeTicket{descriptor, derived}.Run(*this); - } else { - StartTicket().u.emplace(descriptor, derived); - return StatContinue; - } - } - RT_API_ATTRS int BeginDestroy(const Descriptor &descriptor, - const typeInfo::DerivedType &derived, bool finalize) { - if (runTicketsImmediately_) { - return DestroyTicket{descriptor, derived, finalize}.Run(*this); - } else { - StartTicket().u.emplace(descriptor, derived, finalize); - return StatContinue; - } - } - RT_API_ATTRS int BeginAssign(Descriptor &to, const Descriptor &from, - int flags, MemmoveFct memmoveFct) { - if (runTicketsImmediately_) { - return AssignTicket{to, from, flags, memmoveFct}.Run(*this); - } else { - StartTicket().u.emplace(to, from, flags, memmoveFct); - return StatContinue; - } - } - template - RT_API_ATTRS int BeginDerivedAssign(Descriptor &to, const Descriptor &from, - const typeInfo::DerivedType &derived, int flags, MemmoveFct memmoveFct, - Descriptor *deallocateAfter) { - if (runTicketsImmediately_) { - return DerivedAssignTicket{ - to, from, derived, flags, memmoveFct, deallocateAfter} - .Run(*this); - } else { - StartTicket().u.emplace>( - to, from, derived, 
flags, memmoveFct, deallocateAfter); - return StatContinue; - } - } - template - RT_API_ATTRS int BeginDescriptorIo(io::IoStatementState &io, - const Descriptor &descriptor, const io::NonTbpDefinedIoTable *table, - bool &anyIoTookPlace) { - if (runTicketsImmediately_) { - return io::descr::DescriptorIoTicket
{ - io, descriptor, table, anyIoTookPlace} - .Run(*this); - } else { - StartTicket().u.emplace>( - io, descriptor, table, anyIoTookPlace); - return StatContinue; - } - } - template - RT_API_ATTRS int BeginDerivedIo(io::IoStatementState &io, - const Descriptor &descriptor, const typeInfo::DerivedType &derived, - const io::NonTbpDefinedIoTable *table, bool &anyIoTookPlace) { - if (runTicketsImmediately_) { - return io::descr::DerivedIoTicket{ - io, descriptor, derived, table, anyIoTookPlace} - .Run(*this); - } else { - StartTicket().u.emplace>( - io, descriptor, derived, table, anyIoTookPlace); - return StatContinue; - } - } - - RT_API_ATTRS int Run(); - -private: -#if RT_DEVICE_COMPILATION - // Always use the work queue on a GPU device to avoid recursion. - static constexpr bool runTicketsImmediately_{false}; -#else - // Avoid the work queue overhead on the host, unless it needs - // debugging, which is so much easier there. - static constexpr bool runTicketsImmediately_{true}; -#endif - - // Most uses of the work queue won't go very deep. - static constexpr int numStatic_{2}; - - struct TicketList { - bool isStatic{true}; - Ticket ticket; - TicketList *previous{nullptr}, *next{nullptr}; - }; - - RT_API_ATTRS Ticket &StartTicket(); - RT_API_ATTRS void Stop(); - - Terminator &terminator_; - TicketList *first_{nullptr}, *last_{nullptr}, *insertAfter_{nullptr}; - TicketList static_[numStatic_]; - TicketList *firstFree_{static_}; -}; - -} // namespace Fortran::runtime -#endif // FLANG_RT_RUNTIME_WORK_QUEUE_H_ diff --git a/flang-rt/lib/runtime/CMakeLists.txt b/flang-rt/lib/runtime/CMakeLists.txt index 332c0872e065f..a3f63b4315644 100644 --- a/flang-rt/lib/runtime/CMakeLists.txt +++ b/flang-rt/lib/runtime/CMakeLists.txt @@ -68,7 +68,6 @@ set(supported_sources type-info.cpp unit.cpp utf.cpp - work-queue.cpp ) # List of source not used for GPU offloading. 
@@ -132,7 +131,6 @@ set(gpu_sources type-code.cpp type-info.cpp utf.cpp - work-queue.cpp complex-powi.cpp reduce.cpp reduction.cpp diff --git a/flang-rt/lib/runtime/assign.cpp b/flang-rt/lib/runtime/assign.cpp index 41b130cc8f257..bf67b5dc8b645 100644 --- a/flang-rt/lib/runtime/assign.cpp +++ b/flang-rt/lib/runtime/assign.cpp @@ -14,7 +14,6 @@ #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/tools.h" #include "flang-rt/runtime/type-info.h" -#include "flang-rt/runtime/work-queue.h" namespace Fortran::runtime { @@ -103,7 +102,11 @@ static RT_API_ATTRS int AllocateAssignmentLHS( toDim.SetByteStride(stride); stride *= toDim.Extent(); } - return ReturnError(terminator, to.Allocate(kNoAsyncObject)); + int result{ReturnError(terminator, to.Allocate(kNoAsyncObject))}; + if (result == StatOk && derived && !derived->noInitializationNeeded()) { + result = ReturnError(terminator, Initialize(to, *derived, terminator)); + } + return result; } // least <= 0, most >= 0 @@ -228,8 +231,6 @@ static RT_API_ATTRS void BlankPadCharacterAssignment(Descriptor &to, } } -RT_OFFLOAD_API_GROUP_BEGIN - // Common implementation of assignments, both intrinsic assignments and // those cases of polymorphic user-defined ASSIGNMENT(=) TBPs that could not // be resolved in semantics. Most assignment statements do not need any @@ -243,453 +244,275 @@ RT_OFFLOAD_API_GROUP_BEGIN // dealing with array constructors. RT_API_ATTRS void Assign(Descriptor &to, const Descriptor &from, Terminator &terminator, int flags, MemmoveFct memmoveFct) { - WorkQueue workQueue{terminator}; - if (workQueue.BeginAssign(to, from, flags, memmoveFct) == StatContinue) { - workQueue.Run(); - } -} - -RT_API_ATTRS int AssignTicket::Begin(WorkQueue &workQueue) { - bool mustDeallocateLHS{(flags_ & DeallocateLHS) || - MustDeallocateLHS(to_, *from_, workQueue.terminator(), flags_)}; - DescriptorAddendum *toAddendum{to_.Addendum()}; - toDerived_ = toAddendum ? 
toAddendum->derivedType() : nullptr; - if (toDerived_ && (flags_ & NeedFinalization) && - toDerived_->noFinalizationNeeded()) { - flags_ &= ~NeedFinalization; - } - if (MayAlias(to_, *from_)) { + bool mustDeallocateLHS{(flags & DeallocateLHS) || + MustDeallocateLHS(to, from, terminator, flags)}; + DescriptorAddendum *toAddendum{to.Addendum()}; + const typeInfo::DerivedType *toDerived{ + toAddendum ? toAddendum->derivedType() : nullptr}; + if (toDerived && (flags & NeedFinalization) && + toDerived->noFinalizationNeeded()) { + flags &= ~NeedFinalization; + } + std::size_t toElementBytes{to.ElementBytes()}; + std::size_t fromElementBytes{from.ElementBytes()}; + // The following lambda definition violates the conding style, + // but cuda-11.8 nvcc hits an internal error with the brace initialization. + auto isSimpleMemmove = [&]() { + return !toDerived && to.rank() == from.rank() && to.IsContiguous() && + from.IsContiguous() && toElementBytes == fromElementBytes; + }; + StaticDescriptor deferredDeallocStatDesc; + Descriptor *deferDeallocation{nullptr}; + if (MayAlias(to, from)) { if (mustDeallocateLHS) { - // Convert the LHS into a temporary, then make it look deallocated. - toDeallocate_ = &tempDescriptor_.descriptor(); - persist_ = true; // tempDescriptor_ state must outlive child tickets + deferDeallocation = &deferredDeallocStatDesc.descriptor(); std::memcpy( - reinterpret_cast(toDeallocate_), &to_, to_.SizeInBytes()); - to_.set_base_addr(nullptr); - if (toDerived_ && (flags_ & NeedFinalization)) { - if (int status{workQueue.BeginFinalize(*toDeallocate_, *toDerived_)}; - status != StatOk && status != StatContinue) { - return status; - } - flags_ &= ~NeedFinalization; - } - } else if (!IsSimpleMemmove()) { + reinterpret_cast(deferDeallocation), &to, to.SizeInBytes()); + to.set_base_addr(nullptr); + } else if (!isSimpleMemmove()) { // Handle LHS/RHS aliasing by copying RHS into a temp, then // recursively assigning from that temp. 
- auto descBytes{from_->SizeInBytes()}; - Descriptor &newFrom{tempDescriptor_.descriptor()}; - persist_ = true; // tempDescriptor_ state must outlive child tickets - std::memcpy(reinterpret_cast(&newFrom), from_, descBytes); + auto descBytes{from.SizeInBytes()}; + StaticDescriptor staticDesc; + Descriptor &newFrom{staticDesc.descriptor()}; + std::memcpy(reinterpret_cast(&newFrom), &from, descBytes); // Pretend the temporary descriptor is for an ALLOCATABLE // entity, otherwise, the Deallocate() below will not // free the descriptor memory. newFrom.raw().attribute = CFI_attribute_allocatable; - if (int stat{ReturnError( - workQueue.terminator(), newFrom.Allocate(kNoAsyncObject))}; - stat != StatOk) { - return stat; - } - if (HasDynamicComponent(*from_)) { - // If 'from' has allocatable/automatic component, we cannot - // just make a shallow copy of the descriptor member. - // This will still leave data overlap in 'to' and 'newFrom'. - // For example: - // type t - // character, allocatable :: c(:) - // end type t - // type(t) :: x(3) - // x(2:3) = x(1:2) - // We have to make a deep copy into 'newFrom' in this case. - if (const DescriptorAddendum *addendum{newFrom.Addendum()}) { - if (const auto *derived{addendum->derivedType()}) { - if (!derived->noInitializationNeeded()) { - if (int status{workQueue.BeginInitialize(newFrom, *derived)}; - status != StatOk && status != StatContinue) { - return status; - } - } - } - } - static constexpr int nestedFlags{MaybeReallocate | PolymorphicLHS}; - if (int status{workQueue.BeginAssign( - newFrom, *from_, nestedFlags, memmoveFct_)}; - status != StatOk && status != StatContinue) { - return status; + auto stat{ReturnError(terminator, newFrom.Allocate(kNoAsyncObject))}; + if (stat == StatOk) { + if (HasDynamicComponent(from)) { + // If 'from' has allocatable/automatic component, we cannot + // just make a shallow copy of the descriptor member. + // This will still leave data overlap in 'to' and 'newFrom'. 
+ // For example: + // type t + // character, allocatable :: c(:) + // end type t + // type(t) :: x(3) + // x(2:3) = x(1:2) + // We have to make a deep copy into 'newFrom' in this case. + RTNAME(AssignTemporary) + (newFrom, from, terminator.sourceFileName(), terminator.sourceLine()); + } else { + ShallowCopy(newFrom, from, true, from.IsContiguous()); } - } else { - ShallowCopy(newFrom, *from_, true, from_->IsContiguous()); + Assign(to, newFrom, terminator, + flags & + (NeedFinalization | ComponentCanBeDefinedAssignment | + ExplicitLengthCharacterLHS | CanBeDefinedAssignment)); + newFrom.Deallocate(); } - from_ = &newFrom; - flags_ &= NeedFinalization | ComponentCanBeDefinedAssignment | - ExplicitLengthCharacterLHS | CanBeDefinedAssignment; - toDeallocate_ = &newFrom; + return; } } - if (to_.IsAllocatable()) { + if (to.IsAllocatable()) { if (mustDeallocateLHS) { - if (!toDeallocate_ && to_.IsAllocated()) { - toDeallocate_ = &to_; + if (deferDeallocation) { + if ((flags & NeedFinalization) && toDerived) { + Finalize(*deferDeallocation, *toDerived, &terminator); + flags &= ~NeedFinalization; + } + } else { + to.Destroy((flags & NeedFinalization) != 0, /*destroyPointers=*/false, + &terminator); + flags &= ~NeedFinalization; } - } else if (to_.rank() != from_->rank() && !to_.IsAllocated()) { - workQueue.terminator().Crash("Assign: mismatched ranks (%d != %d) in " - "assignment to unallocated allocatable", - to_.rank(), from_->rank()); + } else if (to.rank() != from.rank() && !to.IsAllocated()) { + terminator.Crash("Assign: mismatched ranks (%d != %d) in assignment to " + "unallocated allocatable", + to.rank(), from.rank()); } - } else if (!to_.IsAllocated()) { - workQueue.terminator().Crash( - "Assign: left-hand side variable is neither allocated nor allocatable"); - } - if (toDerived_ && to_.IsAllocated()) { - // Schedule finalization or destruction of the LHS. 
- if (flags_ & NeedFinalization) { - if (int status{workQueue.BeginFinalize(to_, *toDerived_)}; - status != StatOk && status != StatContinue) { - return status; - } - } else if (!toDerived_->noDestructionNeeded()) { - if (int status{ - workQueue.BeginDestroy(to_, *toDerived_, /*finalize=*/false)}; - status != StatOk && status != StatContinue) { - return status; + if (!to.IsAllocated()) { + if (AllocateAssignmentLHS(to, from, terminator, flags) != StatOk) { + return; } + flags &= ~NeedFinalization; + toElementBytes = to.ElementBytes(); // may have changed + toDerived = toAddendum ? toAddendum->derivedType() : nullptr; } } - return StatContinue; -} - -RT_API_ATTRS int AssignTicket::Continue(WorkQueue &workQueue) { - if (done_) { - // All child tickets are complete; can release this ticket's state. - if (toDeallocate_) { - toDeallocate_->Deallocate(); - } - return StatOk; - } - // All necessary finalization or destruction that was initiated by Begin() - // has been completed. Deallocation may be pending, and if it's for the LHS, - // do it now so that the LHS gets reallocated. - if (toDeallocate_ == &to_) { - toDeallocate_ = nullptr; - to_.Deallocate(); - } - // Allocate the LHS if needed - if (!to_.IsAllocated()) { - if (int stat{ - AllocateAssignmentLHS(to_, *from_, workQueue.terminator(), flags_)}; - stat != StatOk) { - return stat; - } - const auto *addendum{to_.Addendum()}; - toDerived_ = addendum ? addendum->derivedType() : nullptr; - if (toDerived_ && !toDerived_->noInitializationNeeded()) { - if (int status{workQueue.BeginInitialize(to_, *toDerived_)}; - status != StatOk) { - return status; - } - } - } - // Check for a user-defined assignment type-bound procedure; - // see 10.2.1.4-5. - // Note that the aliasing and LHS (re)allocation handling above - // needs to run even with CanBeDefinedAssignment flag, since - // Assign() can be invoked recursively for component-wise assignments. 
- if (toDerived_ && (flags_ & CanBeDefinedAssignment)) { - if (to_.rank() == 0) { - if (const auto *special{toDerived_->FindSpecialBinding( + if (toDerived && (flags & CanBeDefinedAssignment)) { + // Check for a user-defined assignment type-bound procedure; + // see 10.2.1.4-5. A user-defined assignment TBP defines all of + // the semantics, including allocatable (re)allocation and any + // finalization. + // + // Note that the aliasing and LHS (re)allocation handling above + // needs to run even with CanBeDefinedAssignment flag, when + // the Assign() is invoked recursively for component-per-component + // assignments. + if (to.rank() == 0) { + if (const auto *special{toDerived->FindSpecialBinding( typeInfo::SpecialBinding::Which::ScalarAssignment)}) { - DoScalarDefinedAssignment(to_, *from_, *special); - done_ = true; - return StatContinue; + return DoScalarDefinedAssignment(to, from, *special); } } - if (const auto *special{toDerived_->FindSpecialBinding( + if (const auto *special{toDerived->FindSpecialBinding( typeInfo::SpecialBinding::Which::ElementalAssignment)}) { - DoElementalDefinedAssignment(to_, *from_, *toDerived_, *special); - done_ = true; - return StatContinue; + return DoElementalDefinedAssignment(to, from, *toDerived, *special); } } - // Intrinsic assignment - std::size_t toElements{to_.Elements()}; - if (from_->rank() > 0 && toElements != from_->Elements()) { - workQueue.terminator().Crash("Assign: mismatching element counts in array " - "assignment (to %zd, from %zd)", - toElements, from_->Elements()); + SubscriptValue toAt[maxRank]; + to.GetLowerBounds(toAt); + // Scalar expansion of the RHS is implied by using the same empty + // subscript values on each (seemingly) elemental reference into + // "from". 
+ SubscriptValue fromAt[maxRank]; + from.GetLowerBounds(fromAt); + std::size_t toElements{to.Elements()}; + if (from.rank() > 0 && toElements != from.Elements()) { + terminator.Crash("Assign: mismatching element counts in array assignment " + "(to %zd, from %zd)", + toElements, from.Elements()); } - if (to_.type() != from_->type()) { - workQueue.terminator().Crash( - "Assign: mismatching types (to code %d != from code %d)", - to_.type().raw(), from_->type().raw()); + if (to.type() != from.type()) { + terminator.Crash("Assign: mismatching types (to code %d != from code %d)", + to.type().raw(), from.type().raw()); } - std::size_t toElementBytes{to_.ElementBytes()}; - std::size_t fromElementBytes{from_->ElementBytes()}; - if (toElementBytes > fromElementBytes && !to_.type().IsCharacter()) { - workQueue.terminator().Crash("Assign: mismatching non-character element " - "sizes (to %zd bytes != from %zd bytes)", + if (toElementBytes > fromElementBytes && !to.type().IsCharacter()) { + terminator.Crash("Assign: mismatching non-character element sizes (to %zd " + "bytes != from %zd bytes)", toElementBytes, fromElementBytes); } - if (toDerived_) { - if (toDerived_->noDefinedAssignment()) { // componentwise - if (int status{workQueue.BeginDerivedAssign( - to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)}; - status != StatOk && status != StatContinue) { - return status; + if (const typeInfo::DerivedType * + updatedToDerived{toAddendum ? toAddendum->derivedType() : nullptr}) { + // Derived type intrinsic assignment, which is componentwise and elementwise + // for all components, including parent components (10.2.1.2-3). + // The target is first finalized if still necessary (7.5.6.3(1)) + if (flags & NeedFinalization) { + Finalize(to, *updatedToDerived, &terminator); + } else if (updatedToDerived && !updatedToDerived->noDestructionNeeded()) { + Destroy(to, /*finalize=*/false, *updatedToDerived, &terminator); + } + // Copy the data components (incl. the parent) first. 
+ const Descriptor &componentDesc{updatedToDerived->component()}; + std::size_t numComponents{componentDesc.Elements()}; + for (std::size_t j{0}; j < toElements; + ++j, to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) { + for (std::size_t k{0}; k < numComponents; ++k) { + const auto &comp{ + *componentDesc.ZeroBasedIndexedElement( + k)}; // TODO: exploit contiguity here + // Use PolymorphicLHS for components so that the right things happen + // when the components are polymorphic; when they're not, they're both + // not, and their declared types will match. + int nestedFlags{MaybeReallocate | PolymorphicLHS}; + if (flags & ComponentCanBeDefinedAssignment) { + nestedFlags |= + CanBeDefinedAssignment | ComponentCanBeDefinedAssignment; + } + switch (comp.genre()) { + case typeInfo::Component::Genre::Data: + if (comp.category() == TypeCategory::Derived) { + StaticDescriptor statDesc[2]; + Descriptor &toCompDesc{statDesc[0].descriptor()}; + Descriptor &fromCompDesc{statDesc[1].descriptor()}; + comp.CreatePointerDescriptor(toCompDesc, to, terminator, toAt); + comp.CreatePointerDescriptor( + fromCompDesc, from, terminator, fromAt); + Assign(toCompDesc, fromCompDesc, terminator, nestedFlags); + } else { // Component has intrinsic type; simply copy raw bytes + std::size_t componentByteSize{comp.SizeInBytes(to)}; + memmoveFct(to.Element(toAt) + comp.offset(), + from.Element(fromAt) + comp.offset(), + componentByteSize); + } + break; + case typeInfo::Component::Genre::Pointer: { + std::size_t componentByteSize{comp.SizeInBytes(to)}; + memmoveFct(to.Element(toAt) + comp.offset(), + from.Element(fromAt) + comp.offset(), + componentByteSize); + } break; + case typeInfo::Component::Genre::Allocatable: + case typeInfo::Component::Genre::Automatic: { + auto *toDesc{reinterpret_cast( + to.Element(toAt) + comp.offset())}; + const auto *fromDesc{reinterpret_cast( + from.Element(fromAt) + comp.offset())}; + // Allocatable components of the LHS are unconditionally + // 
deallocated before assignment (F'2018 10.2.1.3(13)(1)), + // unlike a "top-level" assignment to a variable, where + // deallocation is optional. + // + // Be careful not to destroy/reallocate the LHS, if there is + // overlap between LHS and RHS (it seems that partial overlap + // is not possible, though). + // Invoke Assign() recursively to deal with potential aliasing. + if (toDesc->IsAllocatable()) { + if (!fromDesc->IsAllocated()) { + // No aliasing. + // + // If to is not allocated, the Destroy() call is a no-op. + // This is just a shortcut, because the recursive Assign() + // below would initiate the destruction for to. + // No finalization is required. + toDesc->Destroy( + /*finalize=*/false, /*destroyPointers=*/false, &terminator); + continue; // F'2018 10.2.1.3(13)(2) + } + } + // Force LHS deallocation with DeallocateLHS flag. + // The actual deallocation may be avoided, if the existing + // location can be reoccupied. + Assign(*toDesc, *fromDesc, terminator, nestedFlags | DeallocateLHS); + } break; + } } - } else { // elementwise - if (int status{workQueue.BeginDerivedAssign( - to_, *from_, *toDerived_, flags_, memmoveFct_, toDeallocate_)}; - status != StatOk && status != StatContinue) { - return status; + // Copy procedure pointer components + const Descriptor &procPtrDesc{updatedToDerived->procPtr()}; + std::size_t numProcPtrs{procPtrDesc.Elements()}; + for (std::size_t k{0}; k < numProcPtrs; ++k) { + const auto &procPtr{ + *procPtrDesc.ZeroBasedIndexedElement( + k)}; + memmoveFct(to.Element(toAt) + procPtr.offset, + from.Element(fromAt) + procPtr.offset, + sizeof(typeInfo::ProcedurePointer)); } } - toDeallocate_ = nullptr; - } else if (IsSimpleMemmove()) { - memmoveFct_(to_.raw().base_addr, from_->raw().base_addr, - toElements * toElementBytes); - } else { - // Scalar expansion of the RHS is implied by using the same empty - // subscript values on each (seemingly) elemental reference into - // "from". 
- SubscriptValue toAt[maxRank]; - to_.GetLowerBounds(toAt); - SubscriptValue fromAt[maxRank]; - from_->GetLowerBounds(fromAt); - if (toElementBytes > fromElementBytes) { // blank padding - switch (to_.type().raw()) { + } else { // intrinsic type, intrinsic assignment + if (isSimpleMemmove()) { + memmoveFct(to.raw().base_addr, from.raw().base_addr, + toElements * toElementBytes); + } else if (toElementBytes > fromElementBytes) { // blank padding + switch (to.type().raw()) { case CFI_type_signed_char: case CFI_type_char: - BlankPadCharacterAssignment(to_, *from_, toAt, fromAt, toElements, + BlankPadCharacterAssignment(to, from, toAt, fromAt, toElements, toElementBytes, fromElementBytes); break; case CFI_type_char16_t: - BlankPadCharacterAssignment(to_, *from_, toAt, fromAt, + BlankPadCharacterAssignment(to, from, toAt, fromAt, toElements, toElementBytes, fromElementBytes); break; case CFI_type_char32_t: - BlankPadCharacterAssignment(to_, *from_, toAt, fromAt, + BlankPadCharacterAssignment(to, from, toAt, fromAt, toElements, toElementBytes, fromElementBytes); break; default: - workQueue.terminator().Crash( - "unexpected type code %d in blank padded Assign()", - to_.type().raw()); + terminator.Crash("unexpected type code %d in blank padded Assign()", + to.type().raw()); } } else { // elemental copies, possibly with character truncation for (std::size_t n{toElements}; n-- > 0; - to_.IncrementSubscripts(toAt), from_->IncrementSubscripts(fromAt)) { - memmoveFct_(to_.Element(toAt), from_->Element(fromAt), + to.IncrementSubscripts(toAt), from.IncrementSubscripts(fromAt)) { + memmoveFct(to.Element(toAt), from.Element(fromAt), toElementBytes); } } } - if (persist_) { - done_ = true; - return StatContinue; - } else { - if (toDeallocate_) { - toDeallocate_->Deallocate(); - toDeallocate_ = nullptr; - } - return StatOk; + if (deferDeallocation) { + // deferDeallocation is used only when LHS is an allocatable. + // The finalization has already been run for it. 
+ deferDeallocation->Destroy( + /*finalize=*/false, /*destroyPointers=*/false, &terminator); } } -template -RT_API_ATTRS int DerivedAssignTicket::Begin( - WorkQueue &workQueue) { - if (toIsContiguous_ && fromIsContiguous_ && - this->derived_.noDestructionNeeded() && - this->derived_.noDefinedAssignment() && - this->instance_.rank() == this->from_->rank()) { - if (std::size_t elementBytes{this->instance_.ElementBytes()}; - elementBytes == this->from_->ElementBytes()) { - // Fastest path. Both LHS and RHS are contiguous, RHS is not a scalar - // to be expanded, the types have the same size, and there are no - // allocatable components or defined ASSIGNMENT(=) at any level. - memmoveFct_(this->instance_.template OffsetElement(), - this->from_->template OffsetElement(), - this->instance_.Elements() * elementBytes); - return StatOk; - } - } - // Use PolymorphicLHS for components so that the right things happen - // when the components are polymorphic; when they're not, they're both - // not, and their declared types will match. 
- int nestedFlags{MaybeReallocate | PolymorphicLHS}; - if (flags_ & ComponentCanBeDefinedAssignment) { - nestedFlags |= CanBeDefinedAssignment | ComponentCanBeDefinedAssignment; - } - flags_ = nestedFlags; - // Copy procedure pointer components - const Descriptor &procPtrDesc{this->derived_.procPtr()}; - bool noDataComponents{this->IsComplete()}; - if (std::size_t numProcPtrs{procPtrDesc.Elements()}) { - for (std::size_t k{0}; k < numProcPtrs; ++k) { - const auto &procPtr{ - *procPtrDesc.ZeroBasedIndexedElement(k)}; - // Loop only over elements - if (noDataComponents) { - Elementwise::Reset(); - } - for (; !Elementwise::IsComplete(); Elementwise::Advance()) { - memmoveFct_(this->instance_.template ElementComponent( - this->subscripts_, procPtr.offset), - this->from_->template ElementComponent( - this->fromSubscripts_, procPtr.offset), - sizeof(typeInfo::ProcedurePointer)); - } - } - if (noDataComponents) { - return StatOk; - } - Elementwise::Reset(); - } - if (noDataComponents) { - return StatOk; - } - return StatContinue; -} -template RT_API_ATTRS int DerivedAssignTicket::Begin(WorkQueue &); -template RT_API_ATTRS int DerivedAssignTicket::Begin(WorkQueue &); - -template -RT_API_ATTRS int DerivedAssignTicket::Continue( - WorkQueue &workQueue) { - while (!this->IsComplete()) { - // Copy the data components (incl. the parent) first. 
- switch (this->component_->genre()) { - case typeInfo::Component::Genre::Data: - if (this->component_->category() == TypeCategory::Derived) { - Descriptor &toCompDesc{this->componentDescriptor_.descriptor()}; - Descriptor &fromCompDesc{this->fromComponentDescriptor_.descriptor()}; - this->component_->CreatePointerDescriptor(toCompDesc, this->instance_, - workQueue.terminator(), this->subscripts_); - this->component_->CreatePointerDescriptor(fromCompDesc, *this->from_, - workQueue.terminator(), this->fromSubscripts_); - this->Advance(); - if (int status{workQueue.BeginAssign( - toCompDesc, fromCompDesc, flags_, memmoveFct_)}; - status != StatOk) { - return status; - } - } else { // Component has intrinsic type; simply copy raw bytes - std::size_t componentByteSize{ - this->component_->SizeInBytes(this->instance_)}; - if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) { - std::size_t offset{this->component_->offset()}; - char *to{this->instance_.template OffsetElement(offset)}; - const char *from{ - this->from_->template OffsetElement(offset)}; - std::size_t toElementStride{this->instance_.ElementBytes()}; - std::size_t fromElementStride{ - this->from_->rank() == 0 ? 
0 : this->from_->ElementBytes()}; - if (toElementStride == fromElementStride && - toElementStride == componentByteSize) { - memmoveFct_(to, from, this->elements_ * componentByteSize); - } else { - for (std::size_t n{this->elements_}; n--; - to += toElementStride, from += fromElementStride) { - memmoveFct_(to, from, componentByteSize); - } - } - this->Componentwise::Advance(); - } else { - memmoveFct_( - this->instance_.template Element(this->subscripts_) + - this->component_->offset(), - this->from_->template Element(this->fromSubscripts_) + - this->component_->offset(), - componentByteSize); - this->Advance(); - } - } - break; - case typeInfo::Component::Genre::Pointer: { - std::size_t componentByteSize{ - this->component_->SizeInBytes(this->instance_)}; - if (IS_COMPONENTWISE && toIsContiguous_ && fromIsContiguous_) { - std::size_t offset{this->component_->offset()}; - char *to{this->instance_.template OffsetElement(offset)}; - const char *from{ - this->from_->template OffsetElement(offset)}; - std::size_t toElementStride{this->instance_.ElementBytes()}; - std::size_t fromElementStride{ - this->from_->rank() == 0 ? 
0 : this->from_->ElementBytes()}; - if (toElementStride == fromElementStride && - toElementStride == componentByteSize) { - memmoveFct_(to, from, this->elements_ * componentByteSize); - } else { - for (std::size_t n{this->elements_}; n--; - to += toElementStride, from += fromElementStride) { - memmoveFct_(to, from, componentByteSize); - } - } - this->Componentwise::Advance(); - } else { - memmoveFct_(this->instance_.template Element(this->subscripts_) + - this->component_->offset(), - this->from_->template Element(this->fromSubscripts_) + - this->component_->offset(), - componentByteSize); - this->Advance(); - } - } break; - case typeInfo::Component::Genre::Allocatable: - case typeInfo::Component::Genre::Automatic: { - auto *toDesc{reinterpret_cast( - this->instance_.template Element(this->subscripts_) + - this->component_->offset())}; - const auto *fromDesc{reinterpret_cast( - this->from_->template Element(this->fromSubscripts_) + - this->component_->offset())}; - if (toDesc->IsAllocatable() && !fromDesc->IsAllocated()) { - if (toDesc->IsAllocated()) { - if (this->phase_ == 0) { - this->phase_++; - if (const auto *componentDerived{this->component_->derivedType()}; - componentDerived && !componentDerived->noDestructionNeeded()) { - if (int status{workQueue.BeginDestroy( - *toDesc, *componentDerived, /*finalize=*/false)}; - status != StatOk) { - return status; - } - } - } - toDesc->Deallocate(); - } - this->Advance(); - } else { - // Allocatable components of the LHS are unconditionally - // deallocated before assignment (F'2018 10.2.1.3(13)(1)), - // unlike a "top-level" assignment to a variable, where - // deallocation is optional. - this->Advance(); - int nestedFlags{flags_}; - if (this->derived_.noFinalizationNeeded() && - this->derived_.noInitializationNeeded() && - this->derived_.noDestructionNeeded()) { - // The actual deallocation may be avoided, if the existing - // location can be reoccupied. - } else { - // Force LHS deallocation with DeallocateLHS flag. 
- nestedFlags |= DeallocateLHS; - } - if (int status{workQueue.BeginAssign( - *toDesc, *fromDesc, nestedFlags, memmoveFct_)}; - status != StatOk) { - return status; - } - } - } break; - } - } - if (deallocateAfter_) { - deallocateAfter_->Deallocate(); - } - return StatOk; -} -template RT_API_ATTRS int DerivedAssignTicket::Continue(WorkQueue &); -template RT_API_ATTRS int DerivedAssignTicket::Continue(WorkQueue &); +RT_OFFLOAD_API_GROUP_BEGIN RT_API_ATTRS void DoFromSourceAssign(Descriptor &alloc, const Descriptor &source, Terminator &terminator, MemmoveFct memmoveFct) { @@ -759,6 +582,7 @@ void RTDEF(AssignTemporary)(Descriptor &to, const Descriptor &from, } } } + Assign(to, from, terminator, MaybeReallocate | PolymorphicLHS); } @@ -775,6 +599,7 @@ void RTDEF(CopyInAssign)(Descriptor &temp, const Descriptor &var, void RTDEF(CopyOutAssign)( Descriptor *var, Descriptor &temp, const char *sourceFile, int sourceLine) { Terminator terminator{sourceFile, sourceLine}; + // Copyout from the temporary must not cause any finalizations // for LHS. The variable must be properly initialized already. if (var) { diff --git a/flang-rt/lib/runtime/derived.cpp b/flang-rt/lib/runtime/derived.cpp index 8ab737c701b01..35037036f63e7 100644 --- a/flang-rt/lib/runtime/derived.cpp +++ b/flang-rt/lib/runtime/derived.cpp @@ -12,7 +12,6 @@ #include "flang-rt/runtime/terminator.h" #include "flang-rt/runtime/tools.h" #include "flang-rt/runtime/type-info.h" -#include "flang-rt/runtime/work-queue.h" namespace Fortran::runtime { @@ -31,193 +30,180 @@ static RT_API_ATTRS void GetComponentExtents(SubscriptValue (&extents)[maxRank], } RT_API_ATTRS int Initialize(const Descriptor &instance, - const typeInfo::DerivedType &derived, Terminator &terminator, bool, - const Descriptor *) { - WorkQueue workQueue{terminator}; - int status{workQueue.BeginInitialize(instance, derived)}; - return status == StatContinue ? 
workQueue.Run() : status; -} - -RT_API_ATTRS int InitializeTicket::Begin(WorkQueue &) { - // Initialize procedure pointer components in each element - const Descriptor &procPtrDesc{derived_.procPtr()}; - if (std::size_t numProcPtrs{procPtrDesc.Elements()}) { - bool noDataComponents{IsComplete()}; - for (std::size_t k{0}; k < numProcPtrs; ++k) { - const auto &comp{ - *procPtrDesc.ZeroBasedIndexedElement(k)}; - // Loop only over elements - if (noDataComponents) { - Elementwise::Reset(); - } - for (; !Elementwise::IsComplete(); Elementwise::Advance()) { - auto &pptr{*instance_.ElementComponent( - subscripts_, comp.offset)}; - pptr = comp.procInitialization; - } - } - if (noDataComponents) { - return StatOk; - } - Elementwise::Reset(); - } - return StatContinue; -} - -RT_API_ATTRS int InitializeTicket::Continue(WorkQueue &workQueue) { - while (!IsComplete()) { - if (component_->genre() == typeInfo::Component::Genre::Allocatable) { - // Establish allocatable descriptors - for (; !Elementwise::IsComplete(); Elementwise::Advance()) { - Descriptor &allocDesc{*instance_.ElementComponent( - subscripts_, component_->offset())}; - component_->EstablishDescriptor( - allocDesc, instance_, workQueue.terminator()); + const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat, + const Descriptor *errMsg) { + const Descriptor &componentDesc{derived.component()}; + std::size_t elements{instance.Elements()}; + int stat{StatOk}; + // Initialize data components in each element; the per-element iterations + // constitute the inner loops, not the outer ones + std::size_t myComponents{componentDesc.Elements()}; + for (std::size_t k{0}; k < myComponents; ++k) { + const auto &comp{ + *componentDesc.ZeroBasedIndexedElement(k)}; + SubscriptValue at[maxRank]; + instance.GetLowerBounds(at); + if (comp.genre() == typeInfo::Component::Genre::Allocatable || + comp.genre() == typeInfo::Component::Genre::Automatic) { + for (std::size_t j{0}; j++ < elements; 
instance.IncrementSubscripts(at)) { + Descriptor &allocDesc{ + *instance.ElementComponent(at, comp.offset())}; + comp.EstablishDescriptor(allocDesc, instance, terminator); allocDesc.raw().attribute = CFI_attribute_allocatable; + if (comp.genre() == typeInfo::Component::Genre::Automatic) { + stat = ReturnError( + terminator, allocDesc.Allocate(kNoAsyncObject), errMsg, hasStat); + if (stat == StatOk) { + if (const DescriptorAddendum * addendum{allocDesc.Addendum()}) { + if (const auto *derived{addendum->derivedType()}) { + if (!derived->noInitializationNeeded()) { + stat = Initialize( + allocDesc, *derived, terminator, hasStat, errMsg); + } + } + } + } + if (stat != StatOk) { + break; + } + } } - SkipToNextComponent(); - } else if (const void *init{component_->initialization()}) { + } else if (const void *init{comp.initialization()}) { // Explicit initialization of data pointers and // non-allocatable non-automatic components - std::size_t bytes{component_->SizeInBytes(instance_)}; - for (; !Elementwise::IsComplete(); Elementwise::Advance()) { - char *ptr{instance_.ElementComponent( - subscripts_, component_->offset())}; + std::size_t bytes{comp.SizeInBytes(instance)}; + for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { + char *ptr{instance.ElementComponent(at, comp.offset())}; std::memcpy(ptr, init, bytes); } - SkipToNextComponent(); - } else if (component_->genre() == typeInfo::Component::Genre::Pointer) { + } else if (comp.genre() == typeInfo::Component::Genre::Pointer) { // Data pointers without explicit initialization are established // so that they are valid right-hand side targets of pointer // assignment statements. 
- for (; !Elementwise::IsComplete(); Elementwise::Advance()) { - Descriptor &ptrDesc{*instance_.ElementComponent( - subscripts_, component_->offset())}; - component_->EstablishDescriptor( - ptrDesc, instance_, workQueue.terminator()); + for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { + Descriptor &ptrDesc{ + *instance.ElementComponent(at, comp.offset())}; + comp.EstablishDescriptor(ptrDesc, instance, terminator); ptrDesc.raw().attribute = CFI_attribute_pointer; } - SkipToNextComponent(); - } else if (component_->genre() == typeInfo::Component::Genre::Data && - component_->derivedType() && - !component_->derivedType()->noInitializationNeeded()) { + } else if (comp.genre() == typeInfo::Component::Genre::Data && + comp.derivedType() && !comp.derivedType()->noInitializationNeeded()) { // Default initialization of non-pointer non-allocatable/automatic - // data component. Handles parent component's elements. + // data component. Handles parent component's elements. Recursive. 
SubscriptValue extents[maxRank]; - GetComponentExtents(extents, *component_, instance_); - Descriptor &compDesc{componentDescriptor_.descriptor()}; - const typeInfo::DerivedType &compType{*component_->derivedType()}; - compDesc.Establish(compType, - instance_.ElementComponent(subscripts_, component_->offset()), - component_->rank(), extents); - Advance(); - if (int status{workQueue.BeginInitialize(compDesc, compType)}; - status != StatOk) { - return status; + GetComponentExtents(extents, comp, instance); + StaticDescriptor staticDescriptor; + Descriptor &compDesc{staticDescriptor.descriptor()}; + const typeInfo::DerivedType &compType{*comp.derivedType()}; + for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { + compDesc.Establish(compType, + instance.ElementComponent(at, comp.offset()), comp.rank(), + extents); + stat = Initialize(compDesc, compType, terminator, hasStat, errMsg); + if (stat != StatOk) { + break; + } } - } else { - SkipToNextComponent(); } } - return StatOk; + // Initialize procedure pointer components in each element + const Descriptor &procPtrDesc{derived.procPtr()}; + std::size_t myProcPtrs{procPtrDesc.Elements()}; + for (std::size_t k{0}; k < myProcPtrs; ++k) { + const auto &comp{ + *procPtrDesc.ZeroBasedIndexedElement(k)}; + SubscriptValue at[maxRank]; + instance.GetLowerBounds(at); + for (std::size_t j{0}; j++ < elements; instance.IncrementSubscripts(at)) { + auto &pptr{*instance.ElementComponent( + at, comp.offset)}; + pptr = comp.procInitialization; + } + } + return stat; } RT_API_ATTRS int InitializeClone(const Descriptor &clone, - const Descriptor &original, const typeInfo::DerivedType &derived, + const Descriptor &orig, const typeInfo::DerivedType &derived, Terminator &terminator, bool hasStat, const Descriptor *errMsg) { - if (original.IsPointer() || !original.IsAllocated()) { - return StatOk; // nothing to do - } else { - WorkQueue workQueue{terminator}; - int status{workQueue.BeginInitializeClone( - clone, 
original, derived, hasStat, errMsg)}; - return status == StatContinue ? workQueue.Run() : status; - } -} + const Descriptor &componentDesc{derived.component()}; + std::size_t elements{orig.Elements()}; + int stat{StatOk}; -RT_API_ATTRS int InitializeCloneTicket::Continue(WorkQueue &workQueue) { - while (!IsComplete()) { - if (component_->genre() == typeInfo::Component::Genre::Allocatable) { - Descriptor &origDesc{*instance_.ElementComponent( - subscripts_, component_->offset())}; - if (origDesc.IsAllocated()) { - Descriptor &cloneDesc{*clone_.ElementComponent( - subscripts_, component_->offset())}; - if (phase_ == 0) { - ++phase_; + // Skip pointers and unallocated variables. + if (orig.IsPointer() || !orig.IsAllocated()) { + return stat; + } + // Initialize each data component. + std::size_t components{componentDesc.Elements()}; + for (std::size_t i{0}; i < components; ++i) { + const typeInfo::Component &comp{ + *componentDesc.ZeroBasedIndexedElement(i)}; + SubscriptValue at[maxRank]; + orig.GetLowerBounds(at); + // Allocate allocatable components that are also allocated in the original + // object. + if (comp.genre() == typeInfo::Component::Genre::Allocatable) { + // Initialize each element. + for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) { + Descriptor &origDesc{ + *orig.ElementComponent(at, comp.offset())}; + Descriptor &cloneDesc{ + *clone.ElementComponent(at, comp.offset())}; + if (origDesc.IsAllocated()) { cloneDesc.ApplyMold(origDesc, origDesc.rank()); - if (int stat{ReturnError(workQueue.terminator(), - cloneDesc.Allocate(kNoAsyncObject), errMsg_, hasStat_)}; - stat != StatOk) { - return stat; - } - if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) { - if (const typeInfo::DerivedType *derived{addendum->derivedType()}) { - if (!derived->noInitializationNeeded()) { - // Perform default initialization for the allocated element. 
- if (int status{workQueue.BeginInitialize(cloneDesc, *derived)}; - status != StatOk) { - return status; + stat = ReturnError( + terminator, cloneDesc.Allocate(kNoAsyncObject), errMsg, hasStat); + if (stat == StatOk) { + if (const DescriptorAddendum * addendum{cloneDesc.Addendum()}) { + if (const typeInfo::DerivedType * + derived{addendum->derivedType()}) { + if (!derived->noInitializationNeeded()) { + // Perform default initialization for the allocated element. + stat = Initialize( + cloneDesc, *derived, terminator, hasStat, errMsg); + } + // Initialize derived type's allocatables. + if (stat == StatOk) { + stat = InitializeClone(cloneDesc, origDesc, *derived, + terminator, hasStat, errMsg); } } } } } - if (phase_ == 1) { - ++phase_; - if (const DescriptorAddendum *addendum{cloneDesc.Addendum()}) { - if (const typeInfo::DerivedType *derived{addendum->derivedType()}) { - // Initialize derived type's allocatables. - if (int status{workQueue.BeginInitializeClone( - cloneDesc, origDesc, *derived, hasStat_, errMsg_)}; - status != StatOk) { - return status; - } - } - } + if (stat != StatOk) { + break; } } - Advance(); - } else if (component_->genre() == typeInfo::Component::Genre::Data) { - if (component_->derivedType()) { - // Handle nested derived types. - const typeInfo::DerivedType &compType{*component_->derivedType()}; - SubscriptValue extents[maxRank]; - GetComponentExtents(extents, *component_, instance_); - Descriptor &origDesc{componentDescriptor_.descriptor()}; - Descriptor &cloneDesc{cloneComponentDescriptor_.descriptor()}; + } else if (comp.genre() == typeInfo::Component::Genre::Data && + comp.derivedType()) { + // Handle nested derived types. + const typeInfo::DerivedType &compType{*comp.derivedType()}; + SubscriptValue extents[maxRank]; + GetComponentExtents(extents, comp, orig); + // Data components don't have descriptors, allocate them. 
+ StaticDescriptor origStaticDesc; + StaticDescriptor cloneStaticDesc; + Descriptor &origDesc{origStaticDesc.descriptor()}; + Descriptor &cloneDesc{cloneStaticDesc.descriptor()}; + // Initialize each element. + for (std::size_t j{0}; j < elements; ++j, orig.IncrementSubscripts(at)) { origDesc.Establish(compType, - instance_.ElementComponent(subscripts_, component_->offset()), - component_->rank(), extents); + orig.ElementComponent(at, comp.offset()), comp.rank(), + extents); cloneDesc.Establish(compType, - clone_.ElementComponent(subscripts_, component_->offset()), - component_->rank(), extents); - Advance(); - if (int status{workQueue.BeginInitializeClone( - cloneDesc, origDesc, compType, hasStat_, errMsg_)}; - status != StatOk) { - return status; + clone.ElementComponent(at, comp.offset()), comp.rank(), + extents); + stat = InitializeClone( + cloneDesc, origDesc, compType, terminator, hasStat, errMsg); + if (stat != StatOk) { + break; } - } else { - SkipToNextComponent(); } - } else { - SkipToNextComponent(); - } - } - return StatOk; -} - -// Fortran 2018 subclause 7.5.6.2 -RT_API_ATTRS void Finalize(const Descriptor &descriptor, - const typeInfo::DerivedType &derived, Terminator *terminator) { - if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) { - Terminator stubTerminator{"Finalize() in Fortran runtime", 0}; - WorkQueue workQueue{terminator ? 
*terminator : stubTerminator}; - if (workQueue.BeginFinalize(descriptor, derived) == StatContinue) { - workQueue.Run(); } } + return stat; } static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal( @@ -235,7 +221,7 @@ static RT_API_ATTRS const typeInfo::SpecialBinding *FindFinal( } static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, - const typeInfo::DerivedType &derived, Terminator &terminator) { + const typeInfo::DerivedType &derived, Terminator *terminator) { if (const auto *special{FindFinal(derived, descriptor.rank())}) { if (special->which() == typeInfo::SpecialBinding::Which::ElementalFinal) { std::size_t elements{descriptor.Elements()}; @@ -272,7 +258,9 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, copy = descriptor; copy.set_base_addr(nullptr); copy.raw().attribute = CFI_attribute_allocatable; - RUNTIME_CHECK(terminator, copy.Allocate(kNoAsyncObject) == CFI_SUCCESS); + Terminator stubTerminator{"CallFinalProcedure() in Fortran runtime", 0}; + RUNTIME_CHECK(terminator ? 
*terminator : stubTerminator, + copy.Allocate(kNoAsyncObject) == CFI_SUCCESS); ShallowCopyDiscontiguousToContiguous(copy, descriptor); argDescriptor = © } @@ -296,94 +284,87 @@ static RT_API_ATTRS void CallFinalSubroutine(const Descriptor &descriptor, } } -RT_API_ATTRS int FinalizeTicket::Begin(WorkQueue &workQueue) { - CallFinalSubroutine(instance_, derived_, workQueue.terminator()); +// Fortran 2018 subclause 7.5.6.2 +RT_API_ATTRS void Finalize(const Descriptor &descriptor, + const typeInfo::DerivedType &derived, Terminator *terminator) { + if (derived.noFinalizationNeeded() || !descriptor.IsAllocated()) { + return; + } + CallFinalSubroutine(descriptor, derived, terminator); + const auto *parentType{derived.GetParentType()}; + bool recurse{parentType && !parentType->noFinalizationNeeded()}; // If there's a finalizable parent component, handle it last, as required // by the Fortran standard (7.5.6.2), and do so recursively with the same // descriptor so that the rank is preserved. - finalizableParentType_ = derived_.GetParentType(); - if (finalizableParentType_) { - if (finalizableParentType_->noFinalizationNeeded()) { - finalizableParentType_ = nullptr; - } else { - SkipToNextComponent(); - } - } - return StatContinue; -} - -RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) { - while (!IsComplete()) { - if (component_->genre() == typeInfo::Component::Genre::Allocatable && - component_->category() == TypeCategory::Derived) { + const Descriptor &componentDesc{derived.component()}; + std::size_t myComponents{componentDesc.Elements()}; + std::size_t elements{descriptor.Elements()}; + for (auto k{recurse ? 
std::size_t{1} + /* skip first component, it's the parent */ + : 0}; + k < myComponents; ++k) { + const auto &comp{ + *componentDesc.ZeroBasedIndexedElement(k)}; + SubscriptValue at[maxRank]; + descriptor.GetLowerBounds(at); + if (comp.genre() == typeInfo::Component::Genre::Allocatable && + comp.category() == TypeCategory::Derived) { // Component may be polymorphic or unlimited polymorphic. Need to use the // dynamic type to check whether finalization is needed. - const Descriptor &compDesc{*instance_.ElementComponent( - subscripts_, component_->offset())}; - Advance(); - if (compDesc.IsAllocated()) { - if (const DescriptorAddendum *addendum{compDesc.Addendum()}) { - if (const typeInfo::DerivedType *compDynamicType{ - addendum->derivedType()}) { - if (!compDynamicType->noFinalizationNeeded()) { - if (int status{ - workQueue.BeginFinalize(compDesc, *compDynamicType)}; - status != StatOk) { - return status; + for (std::size_t j{0}; j++ < elements; + descriptor.IncrementSubscripts(at)) { + const Descriptor &compDesc{ + *descriptor.ElementComponent(at, comp.offset())}; + if (compDesc.IsAllocated()) { + if (const DescriptorAddendum * addendum{compDesc.Addendum()}) { + if (const typeInfo::DerivedType * + compDynamicType{addendum->derivedType()}) { + if (!compDynamicType->noFinalizationNeeded()) { + Finalize(compDesc, *compDynamicType, terminator); } } } } } - } else if (component_->genre() == typeInfo::Component::Genre::Allocatable || - component_->genre() == typeInfo::Component::Genre::Automatic) { - if (const typeInfo::DerivedType *compType{component_->derivedType()}; - compType && !compType->noFinalizationNeeded()) { - const Descriptor &compDesc{*instance_.ElementComponent( - subscripts_, component_->offset())}; - Advance(); - if (compDesc.IsAllocated()) { - if (int status{workQueue.BeginFinalize(compDesc, *compType)}; - status != StatOk) { - return status; + } else if (comp.genre() == typeInfo::Component::Genre::Allocatable || + comp.genre() == 
typeInfo::Component::Genre::Automatic) { + if (const typeInfo::DerivedType * compType{comp.derivedType()}) { + if (!compType->noFinalizationNeeded()) { + for (std::size_t j{0}; j++ < elements; + descriptor.IncrementSubscripts(at)) { + const Descriptor &compDesc{ + *descriptor.ElementComponent(at, comp.offset())}; + if (compDesc.IsAllocated()) { + Finalize(compDesc, *compType, terminator); + } } } - } else { - SkipToNextComponent(); } - } else if (component_->genre() == typeInfo::Component::Genre::Data && - component_->derivedType() && - !component_->derivedType()->noFinalizationNeeded()) { + } else if (comp.genre() == typeInfo::Component::Genre::Data && + comp.derivedType() && !comp.derivedType()->noFinalizationNeeded()) { SubscriptValue extents[maxRank]; - GetComponentExtents(extents, *component_, instance_); - Descriptor &compDesc{componentDescriptor_.descriptor()}; - const typeInfo::DerivedType &compType{*component_->derivedType()}; - compDesc.Establish(compType, - instance_.ElementComponent(subscripts_, component_->offset()), - component_->rank(), extents); - Advance(); - if (int status{workQueue.BeginFinalize(compDesc, compType)}; - status != StatOk) { - return status; + GetComponentExtents(extents, comp, descriptor); + StaticDescriptor staticDescriptor; + Descriptor &compDesc{staticDescriptor.descriptor()}; + const typeInfo::DerivedType &compType{*comp.derivedType()}; + for (std::size_t j{0}; j++ < elements; + descriptor.IncrementSubscripts(at)) { + compDesc.Establish(compType, + descriptor.ElementComponent(at, comp.offset()), comp.rank(), + extents); + Finalize(compDesc, compType, terminator); } - } else { - SkipToNextComponent(); } } - // Last, do the parent component, if any and finalizable. 
- if (finalizableParentType_) { - Descriptor &tmpDesc{componentDescriptor_.descriptor()}; - tmpDesc = instance_; + if (recurse) { + StaticDescriptor statDesc; + Descriptor &tmpDesc{statDesc.descriptor()}; + tmpDesc = descriptor; tmpDesc.raw().attribute = CFI_attribute_pointer; - tmpDesc.Addendum()->set_derivedType(finalizableParentType_); - tmpDesc.raw().elem_len = finalizableParentType_->sizeInBytes(); - const auto &parentType{*finalizableParentType_}; - finalizableParentType_ = nullptr; - // Don't return StatOk here if the nested FInalize is still running; - // it needs this->componentDescriptor_. - return workQueue.BeginFinalize(tmpDesc, parentType); + tmpDesc.Addendum()->set_derivedType(parentType); + tmpDesc.raw().elem_len = parentType->sizeInBytes(); + Finalize(tmpDesc, *parentType, terminator); } - return StatOk; } // The order of finalization follows Fortran 2018 7.5.6.2, with @@ -392,71 +373,51 @@ RT_API_ATTRS int FinalizeTicket::Continue(WorkQueue &workQueue) { // preceding any deallocation. RT_API_ATTRS void Destroy(const Descriptor &descriptor, bool finalize, const typeInfo::DerivedType &derived, Terminator *terminator) { - if (!derived.noFinalizationNeeded() && descriptor.IsAllocated()) { - Terminator stubTerminator{"Destroy() in Fortran runtime", 0}; - WorkQueue workQueue{terminator ? 
*terminator : stubTerminator}; - if (workQueue.BeginDestroy(descriptor, derived, finalize) == StatContinue) { - workQueue.Run(); - } + if (derived.noDestructionNeeded() || !descriptor.IsAllocated()) { + return; } -} - -RT_API_ATTRS int DestroyTicket::Begin(WorkQueue &workQueue) { - if (finalize_ && !derived_.noFinalizationNeeded()) { - if (int status{workQueue.BeginFinalize(instance_, derived_)}; - status != StatOk && status != StatContinue) { - return status; - } + if (finalize && !derived.noFinalizationNeeded()) { + Finalize(descriptor, derived, terminator); } - return StatContinue; -} - -RT_API_ATTRS int DestroyTicket::Continue(WorkQueue &workQueue) { // Deallocate all direct and indirect allocatable and automatic components. // Contrary to finalization, the order of deallocation does not matter. - while (!IsComplete()) { - const auto *componentDerived{component_->derivedType()}; - if (component_->genre() == typeInfo::Component::Genre::Allocatable || - component_->genre() == typeInfo::Component::Genre::Automatic) { - Descriptor *d{instance_.ElementComponent( - subscripts_, component_->offset())}; - if (d->IsAllocated()) { - if (phase_ == 0) { - ++phase_; - if (componentDerived && !componentDerived->noDestructionNeeded()) { - if (int status{workQueue.BeginDestroy( - *d, *componentDerived, /*finalize=*/false)}; - status != StatOk) { - return status; - } - } + const Descriptor &componentDesc{derived.component()}; + std::size_t myComponents{componentDesc.Elements()}; + std::size_t elements{descriptor.Elements()}; + SubscriptValue at[maxRank]; + descriptor.GetLowerBounds(at); + for (std::size_t k{0}; k < myComponents; ++k) { + const auto &comp{ + *componentDesc.ZeroBasedIndexedElement(k)}; + const bool destroyComp{ + comp.derivedType() && !comp.derivedType()->noDestructionNeeded()}; + if (comp.genre() == typeInfo::Component::Genre::Allocatable || + comp.genre() == typeInfo::Component::Genre::Automatic) { + for (std::size_t j{0}; j < elements; ++j) { + Descriptor *d{ 
+ descriptor.ElementComponent(at, comp.offset())}; + if (destroyComp) { + Destroy(*d, /*finalize=*/false, *comp.derivedType(), terminator); } d->Deallocate(); + descriptor.IncrementSubscripts(at); } - Advance(); - } else if (component_->genre() == typeInfo::Component::Genre::Data) { - if (!componentDerived || componentDerived->noDestructionNeeded()) { - SkipToNextComponent(); - } else { - SubscriptValue extents[maxRank]; - GetComponentExtents(extents, *component_, instance_); - Descriptor &compDesc{componentDescriptor_.descriptor()}; - const typeInfo::DerivedType &compType{*componentDerived}; + } else if (destroyComp && + comp.genre() == typeInfo::Component::Genre::Data) { + SubscriptValue extents[maxRank]; + GetComponentExtents(extents, comp, descriptor); + StaticDescriptor staticDescriptor; + Descriptor &compDesc{staticDescriptor.descriptor()}; + const typeInfo::DerivedType &compType{*comp.derivedType()}; + for (std::size_t j{0}; j++ < elements; + descriptor.IncrementSubscripts(at)) { compDesc.Establish(compType, - instance_.ElementComponent(subscripts_, component_->offset()), - component_->rank(), extents); - Advance(); - if (int status{workQueue.BeginDestroy( - compDesc, *componentDerived, /*finalize=*/false)}; - status != StatOk) { - return status; - } + descriptor.ElementComponent(at, comp.offset()), comp.rank(), + extents); + Destroy(compDesc, /*finalize=*/false, *comp.derivedType(), terminator); } - } else { - SkipToNextComponent(); } } - return StatOk; } RT_API_ATTRS bool HasDynamicComponent(const Descriptor &descriptor) { diff --git a/flang-rt/lib/runtime/descriptor-io.cpp b/flang-rt/lib/runtime/descriptor-io.cpp index 364724b89ba0d..3db1455af52fe 100644 --- a/flang-rt/lib/runtime/descriptor-io.cpp +++ b/flang-rt/lib/runtime/descriptor-io.cpp @@ -7,44 +7,15 @@ //===----------------------------------------------------------------------===// #include "descriptor-io.h" -#include "edit-input.h" -#include "edit-output.h" -#include "unit.h" -#include 
"flang-rt/runtime/descriptor.h" -#include "flang-rt/runtime/io-stmt.h" -#include "flang-rt/runtime/namelist.h" -#include "flang-rt/runtime/terminator.h" -#include "flang-rt/runtime/type-info.h" -#include "flang-rt/runtime/work-queue.h" -#include "flang/Common/optional.h" #include "flang/Common/restorer.h" -#include "flang/Common/uint128.h" -#include "flang/Runtime/cpp-type.h" #include "flang/Runtime/freestanding-tools.h" -// Implementation of I/O data list item transfers based on descriptors. -// (All I/O items come through here so that the code is exercised for test; -// some scalar I/O data transfer APIs could be changed to bypass their use -// of descriptors in the future for better efficiency.) - namespace Fortran::runtime::io::descr { RT_OFFLOAD_API_GROUP_BEGIN -template -inline RT_API_ATTRS A &ExtractElement(IoStatementState &io, - const Descriptor &descriptor, const SubscriptValue subscripts[]) { - A *p{descriptor.Element(subscripts)}; - if (!p) { - io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base " - "address or subscripts out of range"); - } - return *p; -} - // Defined formatted I/O (maybe) -static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( - IoStatementState &io, const Descriptor &descriptor, - const typeInfo::DerivedType &derived, +Fortran::common::optional DefinedFormattedIo(IoStatementState &io, + const Descriptor &descriptor, const typeInfo::DerivedType &derived, const typeInfo::SpecialBinding &special, const SubscriptValue subscripts[]) { Fortran::common::optional peek{ @@ -133,8 +104,8 @@ static RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( } // Defined unformatted I/O -static RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io, - const Descriptor &descriptor, const typeInfo::DerivedType &derived, +bool DefinedUnformattedIo(IoStatementState &io, const Descriptor &descriptor, + const typeInfo::DerivedType &derived, const typeInfo::SpecialBinding &special) { // Unformatted I/O must have an 
external unit (or child thereof). IoErrorHandler &handler{io.GetIoErrorHandler()}; @@ -181,619 +152,5 @@ static RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &io, return handler.GetIoStat() == IostatOk; } -// Per-category descriptor-based I/O templates - -// TODO (perhaps as a nontrivial but small starter project): implement -// automatic repetition counts, like "10*3.14159", for list-directed and -// NAMELIST array output. - -template -inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io, - const Descriptor &descriptor, [[maybe_unused]] bool isSigned) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - using IntType = CppTypeFor; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - if (auto edit{io.GetNextDataEdit()}) { - IntType &x{ExtractElement(io, descriptor, subscripts)}; - if constexpr (DIR == Direction::Output) { - if (!EditIntegerOutput(io, *edit, x, isSigned)) { - return false; - } - } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditIntegerInput( - io, *edit, reinterpret_cast(&x), KIND, isSigned)) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedIntegerIO: subscripts out of bounds"); - } - } else { - return false; - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedRealIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - using RawType = typename RealOutputEditing::BinaryFloatingPoint; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - if (auto edit{io.GetNextDataEdit()}) { - RawType &x{ExtractElement(io, descriptor, subscripts)}; - if constexpr (DIR == Direction::Output) { - if 
(!RealOutputEditing{io, x}.Edit(*edit)) { - return false; - } - } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditRealInput(io, *edit, reinterpret_cast(&x))) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedRealIO: subscripts out of bounds"); - } - } else { - return false; - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedComplexIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - bool isListOutput{ - io.get_if>() != nullptr}; - using RawType = typename RealOutputEditing::BinaryFloatingPoint; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - RawType *x{&ExtractElement(io, descriptor, subscripts)}; - if (isListOutput) { - DataEdit rEdit, iEdit; - rEdit.descriptor = DataEdit::ListDirectedRealPart; - iEdit.descriptor = DataEdit::ListDirectedImaginaryPart; - rEdit.modes = iEdit.modes = io.mutableModes(); - if (!RealOutputEditing{io, x[0]}.Edit(rEdit) || - !RealOutputEditing{io, x[1]}.Edit(iEdit)) { - return false; - } - } else { - for (int k{0}; k < 2; ++k, ++x) { - auto edit{io.GetNextDataEdit()}; - if (!edit) { - return false; - } else if constexpr (DIR == Direction::Output) { - if (!RealOutputEditing{io, *x}.Edit(*edit)) { - return false; - } - } else if (edit->descriptor == DataEdit::ListDirectedNullValue) { - break; - } else if (EditRealInput( - io, *edit, reinterpret_cast(x))) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedComplexIO: subscripts out of bounds"); - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedCharacterIO( - 
IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - std::size_t length{descriptor.ElementBytes() / sizeof(A)}; - auto *listOutput{io.get_if>()}; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - A *x{&ExtractElement(io, descriptor, subscripts)}; - if (listOutput) { - if (!ListDirectedCharacterOutput(io, *listOutput, x, length)) { - return false; - } - } else if (auto edit{io.GetNextDataEdit()}) { - if constexpr (DIR == Direction::Output) { - if (!EditCharacterOutput(io, *edit, x, length)) { - return false; - } - } else { // input - if (edit->descriptor != DataEdit::ListDirectedNullValue) { - if (EditCharacterInput(io, *edit, x, length)) { - anyInput = true; - } else { - return anyInput && edit->IsNamelist(); - } - } - } - } else { - return false; - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedCharacterIO: subscripts out of bounds"); - } - } - return true; -} - -template -inline RT_API_ATTRS bool FormattedLogicalIO( - IoStatementState &io, const Descriptor &descriptor) { - std::size_t numElements{descriptor.Elements()}; - SubscriptValue subscripts[maxRank]; - descriptor.GetLowerBounds(subscripts); - auto *listOutput{io.get_if>()}; - using IntType = CppTypeFor; - bool anyInput{false}; - for (std::size_t j{0}; j < numElements; ++j) { - IntType &x{ExtractElement(io, descriptor, subscripts)}; - if (listOutput) { - if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) { - return false; - } - } else if (auto edit{io.GetNextDataEdit()}) { - if constexpr (DIR == Direction::Output) { - if (!EditLogicalOutput(io, *edit, x != 0)) { - return false; - } - } else { - if (edit->descriptor != DataEdit::ListDirectedNullValue) { - bool truth{}; - if (EditLogicalInput(io, *edit, truth)) { - x = truth; - anyInput = true; - } else { - return anyInput && 
edit->IsNamelist(); - } - } - } - } else { - return false; - } - if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { - io.GetIoErrorHandler().Crash( - "FormattedLogicalIO: subscripts out of bounds"); - } - } - return true; -} - -template -RT_API_ATTRS int DerivedIoTicket::Continue(WorkQueue &workQueue) { - while (!IsComplete()) { - if (component_->genre() == typeInfo::Component::Genre::Data) { - // Create a descriptor for the component - Descriptor &compDesc{componentDescriptor_.descriptor()}; - component_->CreatePointerDescriptor( - compDesc, instance_, io_.GetIoErrorHandler(), subscripts_); - Advance(); - if (int status{workQueue.BeginDescriptorIo( - io_, compDesc, table_, anyIoTookPlace_)}; - status != StatOk) { - return status; - } - } else { - // Component is itself a descriptor - char *pointer{ - instance_.Element(subscripts_) + component_->offset()}; - const Descriptor &compDesc{ - *reinterpret_cast(pointer)}; - Advance(); - if (compDesc.IsAllocated()) { - if (int status{workQueue.BeginDescriptorIo( - io_, compDesc, table_, anyIoTookPlace_)}; - status != StatOk) { - return status; - } - } - } - } - return StatOk; -} - -template RT_API_ATTRS int DerivedIoTicket::Continue( - WorkQueue &); -template RT_API_ATTRS int DerivedIoTicket::Continue( - WorkQueue &); - -template -RT_API_ATTRS int DescriptorIoTicket::Begin(WorkQueue &workQueue) { - IoErrorHandler &handler{io_.GetIoErrorHandler()}; - if (handler.InError()) { - return handler.GetIoStat(); - } - if (!io_.get_if>()) { - handler.Crash("DescriptorIO() called for wrong I/O direction"); - return handler.GetIoStat(); - } - if constexpr (DIR == Direction::Input) { - if (!io_.BeginReadingRecord()) { - return StatOk; - } - } - if (!io_.get_if>()) { - // Unformatted I/O - IoErrorHandler &handler{io_.GetIoErrorHandler()}; - const DescriptorAddendum *addendum{instance_.Addendum()}; - if (const typeInfo::DerivedType *type{ - addendum ? 
addendum->derivedType() : nullptr}) { - // derived type unformatted I/O - if (table_) { - if (const auto *definedIo{table_->Find(*type, - DIR == Direction::Input - ? common::DefinedIo::ReadUnformatted - : common::DefinedIo::WriteUnformatted)}) { - if (definedIo->subroutine) { - typeInfo::SpecialBinding special{DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadUnformatted - : typeInfo::SpecialBinding::Which::WriteUnformatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false}; - if (DefinedUnformattedIo(io_, instance_, *type, special)) { - anyIoTookPlace_ = true; - return StatOk; - } - } else { - int status{workQueue.BeginDerivedIo( - io_, instance_, *type, table_, anyIoTookPlace_)}; - return status == StatContinue ? StatOk : status; // done here - } - } - } - if (const typeInfo::SpecialBinding *special{ - type->FindSpecialBinding(DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadUnformatted - : typeInfo::SpecialBinding::Which::WriteUnformatted)}) { - if (!table_ || !table_->ignoreNonTbpEntries || special->isTypeBound()) { - // defined derived type unformatted I/O - if (DefinedUnformattedIo(io_, instance_, *type, *special)) { - anyIoTookPlace_ = true; - return StatOk; - } else { - return IostatEnd; - } - } - } - // Default derived type unformatted I/O - // TODO: If no component at any level has defined READ or WRITE - // (as appropriate), the elements are contiguous, and no byte swapping - // is active, do a block transfer via the code below. - int status{workQueue.BeginDerivedIo( - io_, instance_, *type, table_, anyIoTookPlace_)}; - return status == StatContinue ? 
StatOk : status; // done here - } else { - // intrinsic type unformatted I/O - auto *externalUnf{io_.get_if>()}; - ChildUnformattedIoStatementState *childUnf{nullptr}; - InquireIOLengthState *inq{nullptr}; - bool swapEndianness{false}; - if (externalUnf) { - swapEndianness = externalUnf->unit().swapEndianness(); - } else { - childUnf = io_.get_if>(); - if (!childUnf) { - inq = DIR == Direction::Output ? io_.get_if() - : nullptr; - RUNTIME_CHECK(handler, inq != nullptr); - } - } - std::size_t elementBytes{instance_.ElementBytes()}; - std::size_t swappingBytes{elementBytes}; - if (auto maybeCatAndKind{instance_.type().GetCategoryAndKind()}) { - // Byte swapping units can be smaller than elements, namely - // for COMPLEX and CHARACTER. - if (maybeCatAndKind->first == TypeCategory::Character) { - // swap each character position independently - swappingBytes = maybeCatAndKind->second; // kind - } else if (maybeCatAndKind->first == TypeCategory::Complex) { - // swap real and imaginary components independently - swappingBytes /= 2; - } - } - using CharType = - std::conditional_t; - auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool { - if constexpr (DIR == Direction::Output) { - return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes) - : childUnf ? childUnf->Emit(&x, totalBytes, swappingBytes) - : inq->Emit(&x, totalBytes, swappingBytes); - } else { - return externalUnf - ? 
externalUnf->Receive(&x, totalBytes, swappingBytes) - : childUnf->Receive(&x, totalBytes, swappingBytes); - } - }}; - if (!swapEndianness && - instance_.IsContiguous()) { // contiguous unformatted I/O - char &x{ExtractElement(io_, instance_, subscripts_)}; - if (Transfer(x, elements_ * elementBytes)) { - anyIoTookPlace_ = true; - } else { - return IostatEnd; - } - } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O - for (; !IsComplete(); Advance()) { - char &x{ExtractElement(io_, instance_, subscripts_)}; - if (Transfer(x, elementBytes)) { - anyIoTookPlace_ = true; - } else { - return IostatEnd; - } - } - } - } - // Unformatted I/O never needs to call Continue(). - return StatOk; - } - // Formatted I/O - if (auto catAndKind{instance_.type().GetCategoryAndKind()}) { - TypeCategory cat{catAndKind->first}; - int kind{catAndKind->second}; - bool any{false}; - switch (cat) { - case TypeCategory::Integer: - switch (kind) { - case 1: - any = FormattedIntegerIO<1, DIR>(io_, instance_, true); - break; - case 2: - any = FormattedIntegerIO<2, DIR>(io_, instance_, true); - break; - case 4: - any = FormattedIntegerIO<4, DIR>(io_, instance_, true); - break; - case 8: - any = FormattedIntegerIO<8, DIR>(io_, instance_, true); - break; - case 16: - any = FormattedIntegerIO<16, DIR>(io_, instance_, true); - break; - default: - handler.Crash( - "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind); - return IostatEnd; - } - break; - case TypeCategory::Unsigned: - switch (kind) { - case 1: - any = FormattedIntegerIO<1, DIR>(io_, instance_, false); - break; - case 2: - any = FormattedIntegerIO<2, DIR>(io_, instance_, false); - break; - case 4: - any = FormattedIntegerIO<4, DIR>(io_, instance_, false); - break; - case 8: - any = FormattedIntegerIO<8, DIR>(io_, instance_, false); - break; - case 16: - any = FormattedIntegerIO<16, DIR>(io_, instance_, false); - break; - default: - handler.Crash( - "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", 
kind); - return IostatEnd; - } - break; - case TypeCategory::Real: - switch (kind) { - case 2: - any = FormattedRealIO<2, DIR>(io_, instance_); - break; - case 3: - any = FormattedRealIO<3, DIR>(io_, instance_); - break; - case 4: - any = FormattedRealIO<4, DIR>(io_, instance_); - break; - case 8: - any = FormattedRealIO<8, DIR>(io_, instance_); - break; - case 10: - any = FormattedRealIO<10, DIR>(io_, instance_); - break; - // TODO: case double/double - case 16: - any = FormattedRealIO<16, DIR>(io_, instance_); - break; - default: - handler.Crash( - "not yet implemented: REAL(KIND=%d) in formatted IO", kind); - return IostatEnd; - } - break; - case TypeCategory::Complex: - switch (kind) { - case 2: - any = FormattedComplexIO<2, DIR>(io_, instance_); - break; - case 3: - any = FormattedComplexIO<3, DIR>(io_, instance_); - break; - case 4: - any = FormattedComplexIO<4, DIR>(io_, instance_); - break; - case 8: - any = FormattedComplexIO<8, DIR>(io_, instance_); - break; - case 10: - any = FormattedComplexIO<10, DIR>(io_, instance_); - break; - // TODO: case double/double - case 16: - any = FormattedComplexIO<16, DIR>(io_, instance_); - break; - default: - handler.Crash( - "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind); - return IostatEnd; - } - break; - case TypeCategory::Character: - switch (kind) { - case 1: - any = FormattedCharacterIO(io_, instance_); - break; - case 2: - any = FormattedCharacterIO(io_, instance_); - break; - case 4: - any = FormattedCharacterIO(io_, instance_); - break; - default: - handler.Crash( - "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind); - return IostatEnd; - } - break; - case TypeCategory::Logical: - switch (kind) { - case 1: - any = FormattedLogicalIO<1, DIR>(io_, instance_); - break; - case 2: - any = FormattedLogicalIO<2, DIR>(io_, instance_); - break; - case 4: - any = FormattedLogicalIO<4, DIR>(io_, instance_); - break; - case 8: - any = FormattedLogicalIO<8, DIR>(io_, instance_); - break; - 
default: - handler.Crash( - "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind); - return IostatEnd; - } - break; - case TypeCategory::Derived: { - // Derived type information must be present for formatted I/O. - IoErrorHandler &handler{io_.GetIoErrorHandler()}; - const DescriptorAddendum *addendum{instance_.Addendum()}; - RUNTIME_CHECK(handler, addendum != nullptr); - derived_ = addendum->derivedType(); - RUNTIME_CHECK(handler, derived_ != nullptr); - if (table_) { - if (const auto *definedIo{table_->Find(*derived_, - DIR == Direction::Input ? common::DefinedIo::ReadFormatted - : common::DefinedIo::WriteFormatted)}) { - if (definedIo->subroutine) { - nonTbpSpecial_.emplace(DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadFormatted - : typeInfo::SpecialBinding::Which::WriteFormatted, - definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, - false); - special_ = &*nonTbpSpecial_; - } - } - } - if (!special_) { - if (const typeInfo::SpecialBinding *binding{ - derived_->FindSpecialBinding(DIR == Direction::Input - ? typeInfo::SpecialBinding::Which::ReadFormatted - : typeInfo::SpecialBinding::Which::WriteFormatted)}) { - if (!table_ || !table_->ignoreNonTbpEntries || - binding->isTypeBound()) { - special_ = binding; - } - } - } - return StatContinue; - } - } - if (any) { - anyIoTookPlace_ = true; - } else { - return IostatEnd; - } - } else { - handler.Crash("DescriptorIO: bad type code (%d) in descriptor", - static_cast(instance_.type().raw())); - return handler.GetIoStat(); - } - return StatOk; -} - -template RT_API_ATTRS int DescriptorIoTicket::Begin( - WorkQueue &); -template RT_API_ATTRS int DescriptorIoTicket::Begin( - WorkQueue &); - -template -RT_API_ATTRS int DescriptorIoTicket::Continue(WorkQueue &workQueue) { - // Only derived type formatted I/O gets here. 
- while (!IsComplete()) { - if (special_) { - if (auto defined{DefinedFormattedIo( - io_, instance_, *derived_, *special_, subscripts_)}) { - anyIoTookPlace_ |= *defined; - Advance(); - continue; - } - } - Descriptor &elementDesc{elementDescriptor_.descriptor()}; - elementDesc.Establish( - *derived_, nullptr, 0, nullptr, CFI_attribute_pointer); - elementDesc.set_base_addr(instance_.Element(subscripts_)); - Advance(); - if (int status{workQueue.BeginDerivedIo( - io_, elementDesc, *derived_, table_, anyIoTookPlace_)}; - status != StatOk) { - return status; - } - } - return StatOk; -} - -template RT_API_ATTRS int DescriptorIoTicket::Continue( - WorkQueue &); -template RT_API_ATTRS int DescriptorIoTicket::Continue( - WorkQueue &); - -template -RT_API_ATTRS bool DescriptorIO(IoStatementState &io, - const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { - bool anyIoTookPlace{false}; - WorkQueue workQueue{io.GetIoErrorHandler()}; - if (workQueue.BeginDescriptorIo(io, descriptor, table, anyIoTookPlace) == - StatContinue) { - workQueue.Run(); - } - return anyIoTookPlace; -} - -template RT_API_ATTRS bool DescriptorIO( - IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); -template RT_API_ATTRS bool DescriptorIO( - IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); - RT_OFFLOAD_API_GROUP_END } // namespace Fortran::runtime::io::descr diff --git a/flang-rt/lib/runtime/descriptor-io.h b/flang-rt/lib/runtime/descriptor-io.h index 88ad59bd24b53..eb60f106c9203 100644 --- a/flang-rt/lib/runtime/descriptor-io.h +++ b/flang-rt/lib/runtime/descriptor-io.h @@ -9,27 +9,619 @@ #ifndef FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_ #define FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_ -#include "flang-rt/runtime/connection.h" +// Implementation of I/O data list item transfers based on descriptors. 
+// (All I/O items come through here so that the code is exercised for test; +// some scalar I/O data transfer APIs could be changed to bypass their use +// of descriptors in the future for better efficiency.) -namespace Fortran::runtime { -class Descriptor; -} // namespace Fortran::runtime - -namespace Fortran::runtime::io { -class IoStatementState; -struct NonTbpDefinedIoTable; -} // namespace Fortran::runtime::io +#include "edit-input.h" +#include "edit-output.h" +#include "unit.h" +#include "flang-rt/runtime/descriptor.h" +#include "flang-rt/runtime/io-stmt.h" +#include "flang-rt/runtime/namelist.h" +#include "flang-rt/runtime/terminator.h" +#include "flang-rt/runtime/type-info.h" +#include "flang/Common/optional.h" +#include "flang/Common/uint128.h" +#include "flang/Runtime/cpp-type.h" namespace Fortran::runtime::io::descr { +template +inline RT_API_ATTRS A &ExtractElement(IoStatementState &io, + const Descriptor &descriptor, const SubscriptValue subscripts[]) { + A *p{descriptor.Element(subscripts)}; + if (!p) { + io.GetIoErrorHandler().Crash("Bad address for I/O item -- null base " + "address or subscripts out of range"); + } + return *p; +} + +// Per-category descriptor-based I/O templates + +// TODO (perhaps as a nontrivial but small starter project): implement +// automatic repetition counts, like "10*3.14159", for list-directed and +// NAMELIST array output. 
+ +template +inline RT_API_ATTRS bool FormattedIntegerIO(IoStatementState &io, + const Descriptor &descriptor, [[maybe_unused]] bool isSigned) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + using IntType = CppTypeFor; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + if (auto edit{io.GetNextDataEdit()}) { + IntType &x{ExtractElement(io, descriptor, subscripts)}; + if constexpr (DIR == Direction::Output) { + if (!EditIntegerOutput(io, *edit, x, isSigned)) { + return false; + } + } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { + if (EditIntegerInput( + io, *edit, reinterpret_cast(&x), KIND, isSigned)) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedIntegerIO: subscripts out of bounds"); + } + } else { + return false; + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedRealIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + using RawType = typename RealOutputEditing::BinaryFloatingPoint; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + if (auto edit{io.GetNextDataEdit()}) { + RawType &x{ExtractElement(io, descriptor, subscripts)}; + if constexpr (DIR == Direction::Output) { + if (!RealOutputEditing{io, x}.Edit(*edit)) { + return false; + } + } else if (edit->descriptor != DataEdit::ListDirectedNullValue) { + if (EditRealInput(io, *edit, reinterpret_cast(&x))) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedRealIO: subscripts out of bounds"); + } + } else { + 
return false; + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedComplexIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + bool isListOutput{ + io.get_if>() != nullptr}; + using RawType = typename RealOutputEditing::BinaryFloatingPoint; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + RawType *x{&ExtractElement(io, descriptor, subscripts)}; + if (isListOutput) { + DataEdit rEdit, iEdit; + rEdit.descriptor = DataEdit::ListDirectedRealPart; + iEdit.descriptor = DataEdit::ListDirectedImaginaryPart; + rEdit.modes = iEdit.modes = io.mutableModes(); + if (!RealOutputEditing{io, x[0]}.Edit(rEdit) || + !RealOutputEditing{io, x[1]}.Edit(iEdit)) { + return false; + } + } else { + for (int k{0}; k < 2; ++k, ++x) { + auto edit{io.GetNextDataEdit()}; + if (!edit) { + return false; + } else if constexpr (DIR == Direction::Output) { + if (!RealOutputEditing{io, *x}.Edit(*edit)) { + return false; + } + } else if (edit->descriptor == DataEdit::ListDirectedNullValue) { + break; + } else if (EditRealInput( + io, *edit, reinterpret_cast(x))) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedComplexIO: subscripts out of bounds"); + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedCharacterIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + std::size_t length{descriptor.ElementBytes() / sizeof(A)}; + auto *listOutput{io.get_if>()}; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + A *x{&ExtractElement(io, descriptor, subscripts)}; + if (listOutput) { + if 
(!ListDirectedCharacterOutput(io, *listOutput, x, length)) { + return false; + } + } else if (auto edit{io.GetNextDataEdit()}) { + if constexpr (DIR == Direction::Output) { + if (!EditCharacterOutput(io, *edit, x, length)) { + return false; + } + } else { // input + if (edit->descriptor != DataEdit::ListDirectedNullValue) { + if (EditCharacterInput(io, *edit, x, length)) { + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + } + } else { + return false; + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedCharacterIO: subscripts out of bounds"); + } + } + return true; +} + +template +inline RT_API_ATTRS bool FormattedLogicalIO( + IoStatementState &io, const Descriptor &descriptor) { + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + auto *listOutput{io.get_if>()}; + using IntType = CppTypeFor; + bool anyInput{false}; + for (std::size_t j{0}; j < numElements; ++j) { + IntType &x{ExtractElement(io, descriptor, subscripts)}; + if (listOutput) { + if (!ListDirectedLogicalOutput(io, *listOutput, x != 0)) { + return false; + } + } else if (auto edit{io.GetNextDataEdit()}) { + if constexpr (DIR == Direction::Output) { + if (!EditLogicalOutput(io, *edit, x != 0)) { + return false; + } + } else { + if (edit->descriptor != DataEdit::ListDirectedNullValue) { + bool truth{}; + if (EditLogicalInput(io, *edit, truth)) { + x = truth; + anyInput = true; + } else { + return anyInput && edit->IsNamelist(); + } + } + } + } else { + return false; + } + if (!descriptor.IncrementSubscripts(subscripts) && j + 1 < numElements) { + io.GetIoErrorHandler().Crash( + "FormattedLogicalIO: subscripts out of bounds"); + } + } + return true; +} template -RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &, +static RT_API_ATTRS bool DescriptorIO(IoStatementState &, const Descriptor &, const 
NonTbpDefinedIoTable * = nullptr); -extern template RT_API_ATTRS bool DescriptorIO( - IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); -extern template RT_API_ATTRS bool DescriptorIO( - IoStatementState &, const Descriptor &, const NonTbpDefinedIoTable *); +// For intrinsic (not defined) derived type I/O, formatted & unformatted +template +static RT_API_ATTRS bool DefaultComponentIO(IoStatementState &io, + const typeInfo::Component &component, const Descriptor &origDescriptor, + const SubscriptValue origSubscripts[], Terminator &terminator, + const NonTbpDefinedIoTable *table) { +#if !defined(RT_DEVICE_AVOID_RECURSION) + if (component.genre() == typeInfo::Component::Genre::Data) { + // Create a descriptor for the component + StaticDescriptor statDesc; + Descriptor &desc{statDesc.descriptor()}; + component.CreatePointerDescriptor( + desc, origDescriptor, terminator, origSubscripts); + return DescriptorIO(io, desc, table); + } else { + // Component is itself a descriptor + char *pointer{ + origDescriptor.Element(origSubscripts) + component.offset()}; + const Descriptor &compDesc{*reinterpret_cast(pointer)}; + return compDesc.IsAllocated() && DescriptorIO(io, compDesc, table); + } +#else + terminator.Crash("not yet implemented: component IO"); +#endif +} + +template +static RT_API_ATTRS bool DefaultComponentwiseFormattedIO(IoStatementState &io, + const Descriptor &descriptor, const typeInfo::DerivedType &type, + const NonTbpDefinedIoTable *table, const SubscriptValue subscripts[]) { + IoErrorHandler &handler{io.GetIoErrorHandler()}; + const Descriptor &compArray{type.component()}; + RUNTIME_CHECK(handler, compArray.rank() == 1); + std::size_t numComponents{compArray.Elements()}; + SubscriptValue at[maxRank]; + compArray.GetLowerBounds(at); + for (std::size_t k{0}; k < numComponents; + ++k, compArray.IncrementSubscripts(at)) { + const typeInfo::Component &component{ + *compArray.Element(at)}; + if (!DefaultComponentIO( + io, component, descriptor, 
subscripts, handler, table)) { + // Return true for NAMELIST input if any component appeared. + auto *listInput{ + io.get_if>()}; + return DIR == Direction::Input && k > 0 && listInput && + listInput->inNamelistSequence(); + } + } + return true; +} + +template +static RT_API_ATTRS bool DefaultComponentwiseUnformattedIO(IoStatementState &io, + const Descriptor &descriptor, const typeInfo::DerivedType &type, + const NonTbpDefinedIoTable *table) { + IoErrorHandler &handler{io.GetIoErrorHandler()}; + const Descriptor &compArray{type.component()}; + RUNTIME_CHECK(handler, compArray.rank() == 1); + std::size_t numComponents{compArray.Elements()}; + std::size_t numElements{descriptor.Elements()}; + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + for (std::size_t j{0}; j < numElements; + ++j, descriptor.IncrementSubscripts(subscripts)) { + SubscriptValue at[maxRank]; + compArray.GetLowerBounds(at); + for (std::size_t k{0}; k < numComponents; + ++k, compArray.IncrementSubscripts(at)) { + const typeInfo::Component &component{ + *compArray.Element(at)}; + if (!DefaultComponentIO( + io, component, descriptor, subscripts, handler, table)) { + return false; + } + } + } + return true; +} + +RT_API_ATTRS Fortran::common::optional DefinedFormattedIo( + IoStatementState &, const Descriptor &, const typeInfo::DerivedType &, + const typeInfo::SpecialBinding &, const SubscriptValue[]); + +template +static RT_API_ATTRS bool FormattedDerivedTypeIO(IoStatementState &io, + const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { + IoErrorHandler &handler{io.GetIoErrorHandler()}; + // Derived type information must be present for formatted I/O. 
+ const DescriptorAddendum *addendum{descriptor.Addendum()}; + RUNTIME_CHECK(handler, addendum != nullptr); + const typeInfo::DerivedType *type{addendum->derivedType()}; + RUNTIME_CHECK(handler, type != nullptr); + Fortran::common::optional nonTbpSpecial; + const typeInfo::SpecialBinding *special{nullptr}; + if (table) { + if (const auto *definedIo{table->Find(*type, + DIR == Direction::Input ? common::DefinedIo::ReadFormatted + : common::DefinedIo::WriteFormatted)}) { + if (definedIo->subroutine) { + nonTbpSpecial.emplace(DIR == Direction::Input + ? typeInfo::SpecialBinding::Which::ReadFormatted + : typeInfo::SpecialBinding::Which::WriteFormatted, + definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, + false); + special = &*nonTbpSpecial; + } + } + } + if (!special) { + if (const typeInfo::SpecialBinding * + binding{type->FindSpecialBinding(DIR == Direction::Input + ? typeInfo::SpecialBinding::Which::ReadFormatted + : typeInfo::SpecialBinding::Which::WriteFormatted)}) { + if (!table || !table->ignoreNonTbpEntries || binding->isTypeBound()) { + special = binding; + } + } + } + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + std::size_t numElements{descriptor.Elements()}; + for (std::size_t j{0}; j < numElements; + ++j, descriptor.IncrementSubscripts(subscripts)) { + Fortran::common::optional result; + if (special) { + result = DefinedFormattedIo(io, descriptor, *type, *special, subscripts); + } + if (!result) { + result = DefaultComponentwiseFormattedIO( + io, descriptor, *type, table, subscripts); + } + if (!result.value()) { + // Return true for NAMELIST input if we got anything. 
+ auto *listInput{ + io.get_if>()}; + return DIR == Direction::Input && j > 0 && listInput && + listInput->inNamelistSequence(); + } + } + return true; +} + +RT_API_ATTRS bool DefinedUnformattedIo(IoStatementState &, const Descriptor &, + const typeInfo::DerivedType &, const typeInfo::SpecialBinding &); +// Unformatted I/O +template +static RT_API_ATTRS bool UnformattedDescriptorIO(IoStatementState &io, + const Descriptor &descriptor, const NonTbpDefinedIoTable *table = nullptr) { + IoErrorHandler &handler{io.GetIoErrorHandler()}; + const DescriptorAddendum *addendum{descriptor.Addendum()}; + if (const typeInfo::DerivedType * + type{addendum ? addendum->derivedType() : nullptr}) { + // derived type unformatted I/O + if (table) { + if (const auto *definedIo{table->Find(*type, + DIR == Direction::Input ? common::DefinedIo::ReadUnformatted + : common::DefinedIo::WriteUnformatted)}) { + if (definedIo->subroutine) { + typeInfo::SpecialBinding special{DIR == Direction::Input + ? typeInfo::SpecialBinding::Which::ReadUnformatted + : typeInfo::SpecialBinding::Which::WriteUnformatted, + definedIo->subroutine, definedIo->isDtvArgPolymorphic, false, + false}; + if (Fortran::common::optional wasDefined{ + DefinedUnformattedIo(io, descriptor, *type, special)}) { + return *wasDefined; + } + } else { + return DefaultComponentwiseUnformattedIO( + io, descriptor, *type, table); + } + } + } + if (const typeInfo::SpecialBinding * + special{type->FindSpecialBinding(DIR == Direction::Input + ? 
typeInfo::SpecialBinding::Which::ReadUnformatted + : typeInfo::SpecialBinding::Which::WriteUnformatted)}) { + if (!table || !table->ignoreNonTbpEntries || special->isTypeBound()) { + // defined derived type unformatted I/O + return DefinedUnformattedIo(io, descriptor, *type, *special); + } + } + // Default derived type unformatted I/O + // TODO: If no component at any level has defined READ or WRITE + // (as appropriate), the elements are contiguous, and no byte swapping + // is active, do a block transfer via the code below. + return DefaultComponentwiseUnformattedIO(io, descriptor, *type, table); + } else { + // intrinsic type unformatted I/O + auto *externalUnf{io.get_if>()}; + auto *childUnf{io.get_if>()}; + auto *inq{ + DIR == Direction::Output ? io.get_if() : nullptr}; + RUNTIME_CHECK(handler, externalUnf || childUnf || inq); + std::size_t elementBytes{descriptor.ElementBytes()}; + std::size_t numElements{descriptor.Elements()}; + std::size_t swappingBytes{elementBytes}; + if (auto maybeCatAndKind{descriptor.type().GetCategoryAndKind()}) { + // Byte swapping units can be smaller than elements, namely + // for COMPLEX and CHARACTER. + if (maybeCatAndKind->first == TypeCategory::Character) { + // swap each character position independently + swappingBytes = maybeCatAndKind->second; // kind + } else if (maybeCatAndKind->first == TypeCategory::Complex) { + // swap real and imaginary components independently + swappingBytes /= 2; + } + } + SubscriptValue subscripts[maxRank]; + descriptor.GetLowerBounds(subscripts); + using CharType = + std::conditional_t; + auto Transfer{[=](CharType &x, std::size_t totalBytes) -> bool { + if constexpr (DIR == Direction::Output) { + return externalUnf ? externalUnf->Emit(&x, totalBytes, swappingBytes) + : childUnf ? childUnf->Emit(&x, totalBytes, swappingBytes) + : inq->Emit(&x, totalBytes, swappingBytes); + } else { + return externalUnf ? 
externalUnf->Receive(&x, totalBytes, swappingBytes) + : childUnf->Receive(&x, totalBytes, swappingBytes); + } + }}; + bool swapEndianness{externalUnf && externalUnf->unit().swapEndianness()}; + if (!swapEndianness && + descriptor.IsContiguous()) { // contiguous unformatted I/O + char &x{ExtractElement(io, descriptor, subscripts)}; + return Transfer(x, numElements * elementBytes); + } else { // non-contiguous or byte-swapped intrinsic type unformatted I/O + for (std::size_t j{0}; j < numElements; ++j) { + char &x{ExtractElement(io, descriptor, subscripts)}; + if (!Transfer(x, elementBytes)) { + return false; + } + if (!descriptor.IncrementSubscripts(subscripts) && + j + 1 < numElements) { + handler.Crash("DescriptorIO: subscripts out of bounds"); + } + } + return true; + } + } +} + +template +static RT_API_ATTRS bool DescriptorIO(IoStatementState &io, + const Descriptor &descriptor, const NonTbpDefinedIoTable *table) { + IoErrorHandler &handler{io.GetIoErrorHandler()}; + if (handler.InError()) { + return false; + } + if (!io.get_if>()) { + handler.Crash("DescriptorIO() called for wrong I/O direction"); + return false; + } + if constexpr (DIR == Direction::Input) { + if (!io.BeginReadingRecord()) { + return false; + } + } + if (!io.get_if>()) { + return UnformattedDescriptorIO(io, descriptor, table); + } + if (auto catAndKind{descriptor.type().GetCategoryAndKind()}) { + TypeCategory cat{catAndKind->first}; + int kind{catAndKind->second}; + switch (cat) { + case TypeCategory::Integer: + switch (kind) { + case 1: + return FormattedIntegerIO<1, DIR>(io, descriptor, true); + case 2: + return FormattedIntegerIO<2, DIR>(io, descriptor, true); + case 4: + return FormattedIntegerIO<4, DIR>(io, descriptor, true); + case 8: + return FormattedIntegerIO<8, DIR>(io, descriptor, true); + case 16: + return FormattedIntegerIO<16, DIR>(io, descriptor, true); + default: + handler.Crash( + "not yet implemented: INTEGER(KIND=%d) in formatted IO", kind); + return false; + } + case 
TypeCategory::Unsigned: + switch (kind) { + case 1: + return FormattedIntegerIO<1, DIR>(io, descriptor, false); + case 2: + return FormattedIntegerIO<2, DIR>(io, descriptor, false); + case 4: + return FormattedIntegerIO<4, DIR>(io, descriptor, false); + case 8: + return FormattedIntegerIO<8, DIR>(io, descriptor, false); + case 16: + return FormattedIntegerIO<16, DIR>(io, descriptor, false); + default: + handler.Crash( + "not yet implemented: UNSIGNED(KIND=%d) in formatted IO", kind); + return false; + } + case TypeCategory::Real: + switch (kind) { + case 2: + return FormattedRealIO<2, DIR>(io, descriptor); + case 3: + return FormattedRealIO<3, DIR>(io, descriptor); + case 4: + return FormattedRealIO<4, DIR>(io, descriptor); + case 8: + return FormattedRealIO<8, DIR>(io, descriptor); + case 10: + return FormattedRealIO<10, DIR>(io, descriptor); + // TODO: case double/double + case 16: + return FormattedRealIO<16, DIR>(io, descriptor); + default: + handler.Crash( + "not yet implemented: REAL(KIND=%d) in formatted IO", kind); + return false; + } + case TypeCategory::Complex: + switch (kind) { + case 2: + return FormattedComplexIO<2, DIR>(io, descriptor); + case 3: + return FormattedComplexIO<3, DIR>(io, descriptor); + case 4: + return FormattedComplexIO<4, DIR>(io, descriptor); + case 8: + return FormattedComplexIO<8, DIR>(io, descriptor); + case 10: + return FormattedComplexIO<10, DIR>(io, descriptor); + // TODO: case double/double + case 16: + return FormattedComplexIO<16, DIR>(io, descriptor); + default: + handler.Crash( + "not yet implemented: COMPLEX(KIND=%d) in formatted IO", kind); + return false; + } + case TypeCategory::Character: + switch (kind) { + case 1: + return FormattedCharacterIO(io, descriptor); + case 2: + return FormattedCharacterIO(io, descriptor); + case 4: + return FormattedCharacterIO(io, descriptor); + default: + handler.Crash( + "not yet implemented: CHARACTER(KIND=%d) in formatted IO", kind); + return false; + } + case TypeCategory::Logical: 
+ switch (kind) { + case 1: + return FormattedLogicalIO<1, DIR>(io, descriptor); + case 2: + return FormattedLogicalIO<2, DIR>(io, descriptor); + case 4: + return FormattedLogicalIO<4, DIR>(io, descriptor); + case 8: + return FormattedLogicalIO<8, DIR>(io, descriptor); + default: + handler.Crash( + "not yet implemented: LOGICAL(KIND=%d) in formatted IO", kind); + return false; + } + case TypeCategory::Derived: + return FormattedDerivedTypeIO(io, descriptor, table); + } + } + handler.Crash("DescriptorIO: bad type code (%d) in descriptor", + static_cast(descriptor.type().raw())); + return false; +} } // namespace Fortran::runtime::io::descr #endif // FLANG_RT_RUNTIME_DESCRIPTOR_IO_H_ diff --git a/flang-rt/lib/runtime/environment.cpp b/flang-rt/lib/runtime/environment.cpp index 0f0564403c0e2..1d5304254ed0e 100644 --- a/flang-rt/lib/runtime/environment.cpp +++ b/flang-rt/lib/runtime/environment.cpp @@ -143,10 +143,6 @@ void ExecutionEnvironment::Configure(int ac, const char *av[], } } - if (auto *x{std::getenv("FLANG_RT_DEBUG")}) { - internalDebugging = std::strtol(x, nullptr, 10); - } - if (auto *x{std::getenv("ACC_OFFLOAD_STACK_SIZE")}) { char *end; auto n{std::strtoul(x, &end, 10)}; diff --git a/flang-rt/lib/runtime/namelist.cpp b/flang-rt/lib/runtime/namelist.cpp index 1bef387a9771f..b0cf2180fc6d4 100644 --- a/flang-rt/lib/runtime/namelist.cpp +++ b/flang-rt/lib/runtime/namelist.cpp @@ -10,7 +10,6 @@ #include "descriptor-io.h" #include "flang-rt/runtime/emit-encoded.h" #include "flang-rt/runtime/io-stmt.h" -#include "flang-rt/runtime/type-info.h" #include "flang/Runtime/io-api.h" #include #include diff --git a/flang-rt/lib/runtime/tools.cpp b/flang-rt/lib/runtime/tools.cpp index 24d05f369fcbe..b08195cd31e05 100644 --- a/flang-rt/lib/runtime/tools.cpp +++ b/flang-rt/lib/runtime/tools.cpp @@ -205,7 +205,7 @@ RT_API_ATTRS void ShallowCopyInner(const Descriptor &to, const Descriptor &from, // Doing the recursion upwards instead of downwards puts the more common // 
cases earlier in the if-chain and has a tangible impact on performance. template struct ShallowCopyRankSpecialize { - static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from, + static bool execute(const Descriptor &to, const Descriptor &from, bool toIsContiguous, bool fromIsContiguous) { if (to.rank() == RANK && from.rank() == RANK) { ShallowCopyInner(to, from, toIsContiguous, fromIsContiguous); @@ -217,7 +217,7 @@ template struct ShallowCopyRankSpecialize { }; template struct ShallowCopyRankSpecialize { - static RT_API_ATTRS bool execute(const Descriptor &to, const Descriptor &from, + static bool execute(const Descriptor &to, const Descriptor &from, bool toIsContiguous, bool fromIsContiguous) { return false; } diff --git a/flang-rt/lib/runtime/type-info.cpp b/flang-rt/lib/runtime/type-info.cpp index 451213202acef..82182696d70c6 100644 --- a/flang-rt/lib/runtime/type-info.cpp +++ b/flang-rt/lib/runtime/type-info.cpp @@ -140,11 +140,11 @@ RT_API_ATTRS void Component::CreatePointerDescriptor(Descriptor &descriptor, const SubscriptValue *subscripts) const { RUNTIME_CHECK(terminator, genre_ == Genre::Data); EstablishDescriptor(descriptor, container, terminator); - std::size_t offset{offset_}; if (subscripts) { - offset += container.SubscriptsToByteOffset(subscripts); + descriptor.set_base_addr(container.Element(subscripts) + offset_); + } else { + descriptor.set_base_addr(container.OffsetElement() + offset_); } - descriptor.set_base_addr(container.OffsetElement() + offset); descriptor.raw().attribute = CFI_attribute_pointer; } diff --git a/flang-rt/lib/runtime/work-queue.cpp b/flang-rt/lib/runtime/work-queue.cpp deleted file mode 100644 index a508ecb637102..0000000000000 --- a/flang-rt/lib/runtime/work-queue.cpp +++ /dev/null @@ -1,161 +0,0 @@ -//===-- lib/runtime/work-queue.cpp ------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "flang-rt/runtime/work-queue.h" -#include "flang-rt/runtime/environment.h" -#include "flang-rt/runtime/memory.h" -#include "flang-rt/runtime/type-info.h" -#include "flang/Common/visit.h" - -namespace Fortran::runtime { - -#if !defined(RT_DEVICE_COMPILATION) -// FLANG_RT_DEBUG code is disabled when false. -static constexpr bool enableDebugOutput{false}; -#endif - -RT_OFFLOAD_API_GROUP_BEGIN - -RT_API_ATTRS Componentwise::Componentwise(const typeInfo::DerivedType &derived) - : derived_{derived}, components_{derived_.component().Elements()} { - GetComponent(); -} - -RT_API_ATTRS void Componentwise::GetComponent() { - if (IsComplete()) { - component_ = nullptr; - } else { - const Descriptor &componentDesc{derived_.component()}; - component_ = componentDesc.ZeroBasedIndexedElement( - componentAt_); - } -} - -RT_API_ATTRS int Ticket::Continue(WorkQueue &workQueue) { - if (!begun) { - begun = true; - return common::visit( - [&workQueue]( - auto &specificTicket) { return specificTicket.Begin(workQueue); }, - u); - } else { - return common::visit( - [&workQueue](auto &specificTicket) { - return specificTicket.Continue(workQueue); - }, - u); - } -} - -RT_API_ATTRS WorkQueue::~WorkQueue() { - if (last_) { - if ((last_->next = firstFree_)) { - last_->next->previous = last_; - } - firstFree_ = first_; - first_ = last_ = nullptr; - } - while (firstFree_) { - TicketList *next{firstFree_->next}; - if (!firstFree_->isStatic) { - FreeMemory(firstFree_); - } - firstFree_ = next; - } -} - -RT_API_ATTRS Ticket &WorkQueue::StartTicket() { - if (!firstFree_) { - void *p{AllocateMemoryOrCrash(terminator_, sizeof(TicketList))}; - firstFree_ = new (p) TicketList; - firstFree_->isStatic = false; - } - TicketList *newTicket{firstFree_}; - if ((firstFree_ = newTicket->next)) { 
- firstFree_->previous = nullptr; - } - TicketList *after{insertAfter_ ? insertAfter_->next : nullptr}; - if ((newTicket->previous = insertAfter_ ? insertAfter_ : last_)) { - newTicket->previous->next = newTicket; - } else { - first_ = newTicket; - } - if ((newTicket->next = after)) { - after->previous = newTicket; - } else { - last_ = newTicket; - } - newTicket->ticket.begun = false; -#if !defined(RT_DEVICE_COMPILATION) - if (enableDebugOutput && - (executionEnvironment.internalDebugging & - ExecutionEnvironment::WorkQueue)) { - std::fprintf(stderr, "WQ: new ticket\n"); - } -#endif - return newTicket->ticket; -} - -RT_API_ATTRS int WorkQueue::Run() { - while (last_) { - TicketList *at{last_}; - insertAfter_ = last_; -#if !defined(RT_DEVICE_COMPILATION) - if (enableDebugOutput && - (executionEnvironment.internalDebugging & - ExecutionEnvironment::WorkQueue)) { - std::fprintf(stderr, "WQ: %zd %s\n", at->ticket.u.index(), - at->ticket.begun ? "Continue" : "Begin"); - } -#endif - int stat{at->ticket.Continue(*this)}; -#if !defined(RT_DEVICE_COMPILATION) - if (enableDebugOutput && - (executionEnvironment.internalDebugging & - ExecutionEnvironment::WorkQueue)) { - std::fprintf(stderr, "WQ: ... 
stat %d\n", stat); - } -#endif - insertAfter_ = nullptr; - if (stat == StatOk) { - if (at->previous) { - at->previous->next = at->next; - } else { - first_ = at->next; - } - if (at->next) { - at->next->previous = at->previous; - } else { - last_ = at->previous; - } - if ((at->next = firstFree_)) { - at->next->previous = at; - } - at->previous = nullptr; - firstFree_ = at; - } else if (stat != StatContinue) { - Stop(); - return stat; - } - } - return StatOk; -} - -RT_API_ATTRS void WorkQueue::Stop() { - if (last_) { - if ((last_->next = firstFree_)) { - last_->next->previous = last_; - } - firstFree_ = first_; - first_ = last_ = nullptr; - } -} - -RT_OFFLOAD_API_GROUP_END - -} // namespace Fortran::runtime diff --git a/flang-rt/unittests/Runtime/ExternalIOTest.cpp b/flang-rt/unittests/Runtime/ExternalIOTest.cpp index 6c148b1de6f82..3833e48be3dd6 100644 --- a/flang-rt/unittests/Runtime/ExternalIOTest.cpp +++ b/flang-rt/unittests/Runtime/ExternalIOTest.cpp @@ -184,7 +184,7 @@ TEST(ExternalIOTests, TestSequentialFixedUnformatted) { io = IONAME(BeginInquireIoLength)(__FILE__, __LINE__); for (int j{1}; j <= 3; ++j) { ASSERT_TRUE(IONAME(OutputDescriptor)(io, desc)) - << "OutputDescriptor() for InquireIoLength " << j; + << "OutputDescriptor() for InquireIoLength"; } ASSERT_EQ(IONAME(GetIoLength)(io), 3 * recl) << "GetIoLength"; ASSERT_EQ(IONAME(EndIoStatement)(io), IostatOk) diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 871749934810c..78d871c593e1d 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -858,16 +858,6 @@ print *, [(j,j=1,10)] warning since such values may have become defined by the time the nested expression's value is required. -* Intrinsic assignment of arrays is defined elementally, and intrinsic - assignment of derived type components is defined componentwise. - However, when intrinsic assignment takes place for an array of derived - type, the order of the loop nesting is not defined. 
- Some compilers will loop over the elements, assigning all of the components - of each element before proceeding to the next element. - This compiler loops over all of the components, and assigns all of - the elements for each component before proceeding to the next component. - A program using defined assignment might be able to detect the difference. - ## De Facto Standard Features * `EXTENDS_TYPE_OF()` returns `.TRUE.` if both of its arguments have the diff --git a/flang/include/flang/Runtime/assign.h b/flang/include/flang/Runtime/assign.h index eb1f63184a177..bc80997a1bec2 100644 --- a/flang/include/flang/Runtime/assign.h +++ b/flang/include/flang/Runtime/assign.h @@ -38,7 +38,7 @@ enum AssignFlags { ComponentCanBeDefinedAssignment = 1 << 3, ExplicitLengthCharacterLHS = 1 << 4, PolymorphicLHS = 1 << 5, - DeallocateLHS = 1 << 6, + DeallocateLHS = 1 << 6 }; #ifdef RT_DEVICE_COMPILATION diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index 51df7c40f5b8b..4b2bb4fa167f8 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -182,12 +182,9 @@ const Symbol *HasImpureFinal( const Symbol &, std::optional rank = std::nullopt); // Is this type finalizable or does it contain any polymorphic allocatable // ultimate components? -bool MayRequireFinalization(const DerivedTypeSpec &); +bool MayRequireFinalization(const DerivedTypeSpec &derived); // Does this type have an allocatable direct component? -bool HasAllocatableDirectComponent(const DerivedTypeSpec &); -// Does this type have any defined assignment at any level (or any polymorphic -// allocatable)? 
-bool MayHaveDefinedAssignment(const DerivedTypeSpec &); +bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived); bool IsInBlankCommon(const Symbol &); bool IsAssumedLengthCharacter(const Symbol &); diff --git a/flang/lib/Semantics/runtime-type-info.cpp b/flang/lib/Semantics/runtime-type-info.cpp index 4c186f4874152..26ae81f97895a 100644 --- a/flang/lib/Semantics/runtime-type-info.cpp +++ b/flang/lib/Semantics/runtime-type-info.cpp @@ -661,10 +661,6 @@ const Symbol *RuntimeTableBuilder::DescribeType( AddValue(dtValues, derivedTypeSchema_, "nofinalizationneeded"s, IntExpr<1>( derivedTypeSpec && !MayRequireFinalization(*derivedTypeSpec))); - // Similarly, a flag to enable optimized runtime assignment. - AddValue(dtValues, derivedTypeSchema_, "nodefinedassignment"s, - IntExpr<1>( - derivedTypeSpec && !MayHaveDefinedAssignment(*derivedTypeSpec))); } dtObject.get().set_init(MaybeExpr{ StructureExpr(Structure(derivedTypeSchema_, std::move(dtValues)))}); diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index ea5ab2d455b54..ac69e6ff5cb79 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -813,38 +813,6 @@ bool HasAllocatableDirectComponent(const DerivedTypeSpec &derived) { return std::any_of(directs.begin(), directs.end(), IsAllocatable); } -static bool MayHaveDefinedAssignment( - const DerivedTypeSpec &derived, std::set &checked) { - if (const Scope *scope{derived.GetScope()}; - scope && checked.find(scope) == checked.end()) { - checked.insert(scope); - for (const auto &[_, symbolRef] : *scope) { - if (const auto *generic{symbolRef->detailsIf()}) { - if (generic->kind().IsAssignment()) { - return true; - } - } else if (symbolRef->has() && - !IsPointer(*symbolRef)) { - if (const DeclTypeSpec *type{symbolRef->GetType()}) { - if (type->IsPolymorphic()) { - return true; - } else if (const DerivedTypeSpec *derived{type->AsDerived()}) { - if (MayHaveDefinedAssignment(*derived, checked)) { - return true; - } - 
} - } - } - } - } - return false; -} - -bool MayHaveDefinedAssignment(const DerivedTypeSpec &derived) { - std::set checked; - return MayHaveDefinedAssignment(derived, checked); -} - bool IsAssumedLengthCharacter(const Symbol &symbol) { if (const DeclTypeSpec * type{symbol.GetType()}) { return type->category() == DeclTypeSpec::Character && diff --git a/flang/module/__fortran_type_info.f90 b/flang/module/__fortran_type_info.f90 index 7226b06504d28..b30a6bf697563 100644 --- a/flang/module/__fortran_type_info.f90 +++ b/flang/module/__fortran_type_info.f90 @@ -52,8 +52,7 @@ integer(1) :: noInitializationNeeded ! 1 if no component w/ init integer(1) :: noDestructionNeeded ! 1 if no component w/ dealloc/final integer(1) :: noFinalizationNeeded ! 1 if nothing finalizeable - integer(1) :: noDefinedAssignment ! 1 if no defined ASSIGNMENT(=) - integer(1) :: __padding0(3) + integer(1) :: __padding0(4) end type type :: Binding diff --git a/flang/test/Lower/volatile-openmp.f90 b/flang/test/Lower/volatile-openmp.f90 index 2e05b652822b5..28f0bf78f33c9 100644 --- a/flang/test/Lower/volatile-openmp.f90 +++ b/flang/test/Lower/volatile-openmp.f90 @@ -23,11 +23,11 @@ ! CHECK: %[[VAL_11:.*]] = fir.address_of(@_QFEcontainer) : !fir.ref>>}>> ! CHECK: %[[VAL_12:.*]] = fir.volatile_cast %[[VAL_11]] : (!fir.ref>>}>>) -> !fir.ref>>}>, volatile> ! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_12]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFEcontainer"} : (!fir.ref>>}>, volatile>) -> (!fir.ref>>}>, volatile>, !fir.ref>>}>, volatile>) -! 
CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> +! CHECK: %[[VAL_14:.*]] = fir.address_of(@_QFE.c.t) : !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>> ! 
CHECK: %[[VAL_15:.*]] = fir.shape_shift %[[VAL_0]], %[[VAL_1]] : (index, index) -> !fir.shapeshift<1> -! CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> 
(!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) -! 
CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>> -! CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) 
-> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>, !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,nodefinedassignment:i8,__padding0:!fir.array<3xi8>}>>) +! 
CHECK: %[[VAL_16:.*]]:2 = hlfir.declare %[[VAL_14]](%[[VAL_15]]) {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.c.t"} : (!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, !fir.shapeshift<1>) -> (!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>, 
!fir.ref>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>) +! CHECK: %[[VAL_17:.*]] = fir.address_of(@_QFE.dt.t) : !fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>> +! 
CHECK: %[[VAL_18:.*]]:2 = hlfir.declare %[[VAL_17]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFE.dt.t"} : (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) -> (!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>, 
!fir.ref,name:!fir.box>>}>>>>,name:!fir.box>>,sizeinbytes:i64,uninstantiated:!fir.box>>,kindparameter:!fir.box>>,lenparameterkind:!fir.box>>,component:!fir.box>>,genre:i8,category:i8,kind:i8,rank:i8,__padding0:!fir.array<4xi8>,offset:i64,characterlen:!fir.type<_QM__fortran_type_infoTvalue{{[<]?}}{genre:i8,__padding0:!fir.array<7xi8>,value:i64}{{[>]?}}>,derived:!fir.box>>,lenvalue:!fir.box,value:i64}{{[>]?}}>>>>,bounds:!fir.box,value:i64}{{[>]?}}>>>>,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_ptr{__address:i64}>}>>>>,procptr:!fir.box>>,offset:i64,initialization:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}>>>>,special:!fir.box,proc:!fir.type<_QM__fortran_builtinsT__builtin_c_funptr{__address:i64}>}{{[>]?}}>>>>,specialbitset:i32,hasparent:i8,noinitializationneeded:i8,nodestructionneeded:i8,nofinalizationneeded:i8,__padding0:!fir.array<4xi8>}>>) ! CHECK: %[[VAL_19:.*]] = hlfir.designate %[[VAL_13]]#0{"array"} {fortran_attrs = #fir.var_attrs} : (!fir.ref>>}>, volatile>) -> !fir.ref>>, volatile> ! CHECK: %[[VAL_20:.*]] = fir.load %[[VAL_19]] : !fir.ref>>, volatile> ! 
CHECK: %[[VAL_21:.*]]:3 = fir.box_dims %[[VAL_20]], %[[VAL_0]] : (!fir.box>>, index) -> (index, index, index) diff --git a/flang/test/Semantics/typeinfo01.f90 b/flang/test/Semantics/typeinfo01.f90 index 7dc92504aeebf..d228cd2a84ca4 100644 --- a/flang/test/Semantics/typeinfo01.f90 +++ b/flang/test/Semantics/typeinfo01.f90 @@ -8,7 +8,7 @@ module m01 end type !CHECK: Module scope: m01 !CHECK: .c.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.n,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .n.n, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"n" !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1" !CHECK: DerivedType scope: t1 @@ -23,8 +23,8 @@ module m02 end type !CHECK: .c.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:1_8 
init:[component::component(name=.n.parent,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.parent,lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.cn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=4_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .c.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.pn,genre=1_1,category=0_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) -!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.child, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.child,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.child,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.parent, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: 
TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.parent,sizeinbytes=4_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.parent,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) end module module m03 @@ -35,7 +35,7 @@ module m03 type(kpdt(4)) :: x !CHECK: .c.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.a,genre=1_1,category=2_1,kind=4_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] !CHECK: .dt.kpdt, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.kpdt,uninstantiated=NULL(),kindparameter=.kp.kpdt,lenparameterkind=NULL()) -!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.kpdt,sizeinbytes=4_8,uninstantiated=.dt.kpdt,kindparameter=.kp.kpdt.4,lenparameterkind=NULL(),component=.c.kpdt.4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .kp.kpdt.4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(8) shape: 0_8:0_8 init:[INTEGER(8)::4_8] end module @@ -49,7 +49,7 @@ module m04 subroutine s1(x) class(tbps), intent(in) :: x end subroutine -!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity 
type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.tbps,name=.n.tbps,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .v.tbps, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:1_8 init:[binding::binding(proc=s1,name=.n.b1),binding(proc=s1,name=.n.b2)] end module @@ -61,7 +61,7 @@ module m05 subroutine s1(x) class(t), intent(in) :: x end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=8_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=.p.t,special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .p.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(procptrcomponent) shape: 0_8:0_8 init:[procptrcomponent::procptrcomponent(name=.n.p1,offset=0_8,initialization=s1)] end module @@ -85,8 +85,8 @@ 
subroutine s2(x, y) class(t), intent(in) :: y end subroutine !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: 
TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] @@ -113,8 +113,8 @@ subroutine s2(x, y) class(t2), intent(in) :: y end subroutine !CHECK: .c.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=.dt.t,lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=2_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t2,name=.n.t2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=.s.t2,specialbitset=2_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .s.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=1_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s2)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] @@ -132,7 +132,7 @@ impure elemental subroutine s1(x, y) class(t), intent(out) :: x class(t), intent(in) :: y end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:0_8 init:[specialbinding::specialbinding(which=2_1,isargdescriptorset=3_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:0_8 init:[binding::binding(proc=s1,name=.n.s1)] end module @@ -155,7 +155,7 @@ impure elemental subroutine s3(x) subroutine s4(x) type(t), contiguous :: x(:,:,:) end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=7296_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 
init:[specialbinding::specialbinding(which=7_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=0_1,proc=s3),specialbinding(which=10_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=s1),specialbinding(which=11_1,isargdescriptorset=0_1,istypebound=1_1,isargcontiguousset=1_1,proc=s2),specialbinding(which=12_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=1_1,proc=s4)] end module @@ -197,7 +197,7 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.t,name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=1_1,istypebound=1_1,isargcontiguousset=0_1,proc=wu)] !CHECK: .v.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(binding) shape: 0_8:3_8 
init:[binding::binding(proc=rf,name=.n.rf),binding(proc=ru,name=.n.ru),binding(proc=wf,name=.n.wf),binding(proc=wu,name=.n.wu)] end module @@ -246,7 +246,7 @@ subroutine wu(x,u,iostat,iomsg) integer, intent(out) :: iostat character(len=*), intent(inout) :: iomsg end subroutine -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.t,specialbitset=120_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .s.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(specialbinding) shape: 0_8:3_8 init:[specialbinding::specialbinding(which=3_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=rf),specialbinding(which=4_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=ru),specialbinding(which=5_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wf),specialbinding(which=6_1,isargdescriptorset=0_1,istypebound=0_1,isargcontiguousset=0_1,proc=wu)] end module @@ -263,7 +263,7 @@ module m11 !CHECK: .c.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:3_8 
init:[component::component(name=.n.allocatable,genre=3_1,category=2_1,kind=4_1,rank=1_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.pointer,genre=2_1,category=2_1,kind=4_1,rank=0_1,offset=48_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=.di.t.pointer),component(name=.n.chauto,genre=4_1,category=4_1,kind=1_1,rank=0_1,offset=72_8,characterlen=value(genre=3_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=NULL(),initialization=NULL()),component(name=.n.automatic,genre=4_1,category=2_1,kind=4_1,rank=1_1,offset=96_8,characterlen=value(genre=1_1,value=0_8),derived=NULL(),lenvalue=NULL(),bounds=.b.t.automatic,initialization=NULL())] !CHECK: .di.t.pointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(.dp.t.pointer) init:.dp.t.pointer(pointer=target) !CHECK: .dp.t.pointer (CompilerCreated): DerivedType components: pointer -!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t,sizeinbytes=144_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.t,component=.c.t,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) !CHECK: .lpk.t, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::8_1] !CHECK: DerivedType scope: .dp.t.pointer size=24 alignment=8 instantiation of .dp.t.pointer !CHECK: 
pointer, POINTER size=24 offset=0: ObjectEntity type: REAL(4) diff --git a/flang/test/Semantics/typeinfo03.f90 b/flang/test/Semantics/typeinfo03.f90 index e2552d0a21d6f..f0c0a817da4a4 100644 --- a/flang/test/Semantics/typeinfo03.f90 +++ b/flang/test/Semantics/typeinfo03.f90 @@ -6,4 +6,4 @@ module m class(*), pointer :: sp, ap(:) end type end module -!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.haspointer, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.haspointer,sizeinbytes=104_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.haspointer,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) diff --git a/flang/test/Semantics/typeinfo04.f90 b/flang/test/Semantics/typeinfo04.f90 index 94dd2199db35a..de8464321a409 100644 --- a/flang/test/Semantics/typeinfo04.f90 +++ b/flang/test/Semantics/typeinfo04.f90 @@ -7,18 +7,18 @@ module m contains final :: final end type -!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) +!CHECK: .dt.finalizable, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(binding=NULL(),name=.n.finalizable,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.finalizable,specialbitset=128_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) type, abstract :: t1 end type -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) type, abstract :: t2 real, allocatable :: a(:) end type -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t2,sizeinbytes=48_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) type, abstract :: t3 type(finalizable) :: x end type -!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) 
init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=1_1) +!CHECK: .dt.t3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(name=.n.t3,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t3,procptr=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) contains impure elemental subroutine final(x) type(finalizable), intent(in out) :: x diff --git a/flang/test/Semantics/typeinfo05.f90 b/flang/test/Semantics/typeinfo05.f90 index df1aecf3821de..2a7f12a153eb8 100644 --- a/flang/test/Semantics/typeinfo05.f90 +++ b/flang/test/Semantics/typeinfo05.f90 @@ -7,10 +7,10 @@ program main type t1 type(t2), pointer :: b end type t1 -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) type :: t2 type(t1) :: a end type t2 -! 
CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) end program main diff --git a/flang/test/Semantics/typeinfo06.f90 b/flang/test/Semantics/typeinfo06.f90 index 22f37b1a4369d..2385709a8eb44 100644 --- a/flang/test/Semantics/typeinfo06.f90 +++ b/flang/test/Semantics/typeinfo06.f90 @@ -7,10 +7,10 @@ program main type t1 type(t2), allocatable :: b end type t1 -!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t1,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) type :: t2 type(t1) :: a end type t2 -! 
CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +! CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) end program main diff --git a/flang/test/Semantics/typeinfo07.f90 b/flang/test/Semantics/typeinfo07.f90 index ab20d6f601106..e8766d9811db8 100644 --- a/flang/test/Semantics/typeinfo07.f90 +++ b/flang/test/Semantics/typeinfo07.f90 @@ -16,7 +16,7 @@ type(t_container_extension) :: wrapper end type end -! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) -! CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) -! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) -! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) +! CHECK: .dt.t_container, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +! 
CHECK: .dt.t_container_extension, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) +! CHECK: .dt.t_container_not_polymorphic, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1) +! CHECK: .dt.t_container_wrapper, SAVE, TARGET (CompilerCreated, ReadOnly): {{.*}}noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) diff --git a/flang/test/Semantics/typeinfo08.f90 b/flang/test/Semantics/typeinfo08.f90 index 391a66f3d6664..689cf469dee3b 100644 --- a/flang/test/Semantics/typeinfo08.f90 +++ b/flang/test/Semantics/typeinfo08.f90 @@ -13,7 +13,7 @@ module m !CHECK: Module scope: m size=0 alignment=1 sourceRange=113 bytes !CHECK: .c.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(component) shape: 0_8:0_8 init:[component::component(name=.n.t1,genre=1_1,category=6_1,kind=0_1,rank=0_1,offset=0_8,characterlen=value(genre=1_1,value=0_8),lenvalue=NULL(),bounds=NULL(),initialization=NULL())] -!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) +!CHECK: .dt.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.s,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=.lpk.s,component=.c.s,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1) !CHECK: .lpk.s, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: INTEGER(1) shape: 0_8:0_8 init:[INTEGER(1)::4_1] !CHECK: .n.s, SAVE, 
TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(1_8,1) init:"s" !CHECK: .n.t1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: CHARACTER(2_8,1) init:"t1" diff --git a/flang/test/Semantics/typeinfo11.f90 b/flang/test/Semantics/typeinfo11.f90 index 08e0b95abb763..92efc8f9ea54b 100644 --- a/flang/test/Semantics/typeinfo11.f90 +++ b/flang/test/Semantics/typeinfo11.f90 @@ -14,4 +14,4 @@ type(t2) x end -!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) +!CHECK: .dt.t2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.t2,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.t2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1) diff --git a/flang/test/Semantics/typeinfo12.f90 b/flang/test/Semantics/typeinfo12.f90 deleted file mode 100644 index 6b23b63d28b1d..0000000000000 --- a/flang/test/Semantics/typeinfo12.f90 +++ /dev/null @@ -1,67 +0,0 @@ -!RUN: bbc --dump-symbols %s | FileCheck %s -!Check "nodefinedassignment" settings. 
- -module m01 - - type hasAsst1 - contains - procedure asst1 - generic :: assignment(=) => asst1 - end type -!CHECK: .dt.hasasst1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.hasasst1,name=.n.hasasst1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=.s.hasasst1,specialbitset=4_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) - - type hasAsst2 ! no defined assignment relevant to the runtime - end type - interface assignment(=) - procedure asst2 - end interface -!CHECK: .dt.hasasst2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.hasasst2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=NULL(),procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) - - type test1 - type(hasAsst1) c - end type -!CHECK: .dt.test1, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test1,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test1,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) - - type test2 - type(hasAsst2) c - end type -!CHECK: .dt.test2, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test2,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test2,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) - - 
type test3 - type(hasAsst1), pointer :: p - end type -!CHECK: .dt.test3, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test3,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test3,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) - - type test4 - type(hasAsst2), pointer :: p - end type -!CHECK: .dt.test4, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test4,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test4,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) - - type, extends(hasAsst1) :: test5 - end type -!CHECK: .dt.test5, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=.v.test5,name=.n.test5,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test5,procptr=NULL(),special=.s.test5,specialbitset=4_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=0_1) - - type, extends(hasAsst2) :: test6 - end type -!CHECK: .dt.test6, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test6,sizeinbytes=0_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test6,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=1_1,noinitializationneeded=1_1,nodestructionneeded=1_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) - - type test7 - type(test7), allocatable :: c - end type -!CHECK: .dt.test7, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: 
TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test7,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test7,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=1_1,nodefinedassignment=1_1) - - type test8 - class(test8), allocatable :: c - end type -!CHECK: .dt.test8, SAVE, TARGET (CompilerCreated, ReadOnly): ObjectEntity type: TYPE(derivedtype) init:derivedtype(binding=NULL(),name=.n.test8,sizeinbytes=40_8,uninstantiated=NULL(),kindparameter=NULL(),lenparameterkind=NULL(),component=.c.test8,procptr=NULL(),special=NULL(),specialbitset=0_4,hasparent=0_1,noinitializationneeded=0_1,nodestructionneeded=0_1,nofinalizationneeded=0_1,nodefinedassignment=0_1) - - contains - impure elemental subroutine asst1(left, right) - class(hasAsst1), intent(out) :: left - class(hasAsst1), intent(in) :: right - end - impure elemental subroutine asst2(left, right) - class(hasAsst2), intent(out) :: left - class(hasAsst2), intent(in) :: right - end -end From 9150a8249f69930a9ed1e7e523555af9815876ec Mon Sep 17 00:00:00 2001 From: Igor Wodiany Date: Wed, 11 Jun 2025 15:59:47 +0100 Subject: [PATCH 089/851] [mlir][spirv] Add definition for GL Exp2 (#143678) --- .../mlir/Dialect/SPIRV/IR/SPIRVGLOps.td | 28 +++++++++++++++++++ mlir/test/Dialect/SPIRV/IR/gl-ops.mlir | 26 +++++++++++++++++ mlir/test/Target/SPIRV/gl-ops.mlir | 2 ++ 3 files changed, 56 insertions(+) diff --git a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td index 4c7186077fae0..f3f75240e5214 100644 --- a/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td +++ b/mlir/include/mlir/Dialect/SPIRV/IR/SPIRVGLOps.td @@ -838,6 +838,34 @@ def SPIRV_GLAtanhOp : SPIRV_GLUnaryArithmeticOp<"Atanh", 24, SPIRV_Float16or32> // ----- +def SPIRV_GLExp2Op : SPIRV_GLUnaryArithmeticOp<"Exp2", 29, SPIRV_Float16or32> { + let summary = "Result is 2 raised to the x 
power"; + + let description = [{ + Result is 2 raised to the x power; 2**x. + + ``` + exp2(Inf) = Inf. + exp2(-Inf) = +0. + ``` + + The operand x must be a scalar or vector whose component type is 16-bit or + 32-bit floating-point. + + Result Type and the type of x must be the same type. Results are computed + per component. + + #### Example: + + ```mlir + %2 = spirv.GL.Exp2 %0 : f32 + %3 = spirv.GL.Exp2 %1 : vector<3xf16> + ``` + }]; +} + +// ----- + def SPIRV_GLLog2Op : SPIRV_GLUnaryArithmeticOp<"Log2", 30, SPIRV_Float16or32> { let summary = "Result is the base-2 logarithm of x"; diff --git a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir index 50cf1b26d42ab..29beee5aea93c 100644 --- a/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir +++ b/mlir/test/Dialect/SPIRV/IR/gl-ops.mlir @@ -789,3 +789,29 @@ func.func @tanh_invalid_type(%arg0 : i32) -> () { %0 = spirv.GL.Tanh %arg0 : i32 return } + +// ----- + +//===----------------------------------------------------------------------===// +// spirv.GL.Exp2 +//===----------------------------------------------------------------------===// + +func.func @exp2(%arg0 : f32) -> () { + // CHECK: spirv.GL.Exp2 {{%.*}} : f32 + %0 = spirv.GL.Exp2 %arg0 : f32 + return +} + +func.func @exp2vec(%arg0 : vector<3xf16>) -> () { + // CHECK: spirv.GL.Exp2 {{%.*}} : vector<3xf16> + %0 = spirv.GL.Exp2 %arg0 : vector<3xf16> + return +} + +// ----- + +func.func @exp2_invalid_type(%arg0 : i32) -> () { + // expected-error @+1 {{op operand #0 must be 16/32-bit float or vector of 16/32-bit float values}} + %0 = spirv.GL.Exp2 %arg0 : i32 + return +} diff --git a/mlir/test/Target/SPIRV/gl-ops.mlir b/mlir/test/Target/SPIRV/gl-ops.mlir index 368f60e102dc1..3dee03345e9a1 100644 --- a/mlir/test/Target/SPIRV/gl-ops.mlir +++ b/mlir/test/Target/SPIRV/gl-ops.mlir @@ -44,6 +44,8 @@ spirv.module Logical GLSL450 requires #spirv.vce { %20 = spirv.GL.Log2 %arg0 : f32 // CHECK: {{%.*}} = spirv.GL.Tanh {{%.*}} : f32 %21 = spirv.GL.Tanh %arg0 
: f32 + // CHECK: {{%.*}} = spirv.GL.Exp2 {{%.*}} : f32 + %22 = spirv.GL.Exp2 %arg0 : f32 spirv.Return } From 3ca6ea0f3aabcfba318ce9b14e4567f05de3b556 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Wed, 11 Jun 2025 08:02:44 -0700 Subject: [PATCH 090/851] [Clang][ByteCode][NFC] Move APInt into pushInteger since it is being passed by value (#143578) Static analysis flagged that we could move APInt instead of copy, indeed it has a move constructor and so we should move into values for APInt. --- clang/lib/AST/ByteCode/InterpBuiltin.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index b678f229d50bb..5fc5034569597 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -1321,7 +1321,7 @@ static bool interp__builtin_ia32_pdep(InterpState &S, CodePtr OpPC, if (Mask[I]) Result.setBitVal(I, Val[P++]); } - pushInteger(S, Result, Call->getType()); + pushInteger(S, std::move(Result), Call->getType()); return true; } @@ -1344,7 +1344,7 @@ static bool interp__builtin_ia32_pext(InterpState &S, CodePtr OpPC, if (Mask[I]) Result.setBitVal(P++, Val[I]); } - pushInteger(S, Result, Call->getType()); + pushInteger(S, std::move(Result), Call->getType()); return true; } From 141d390dcb6cd174b07ca663e58f37ab24eee08a Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 11 Jun 2025 10:05:34 -0500 Subject: [PATCH 091/851] [flang][OpenMP] Overhaul implementation of ATOMIC construct (#137852) The parser will accept a wide variety of illegal attempts at forming an ATOMIC construct, leaving it to the semantic analysis to diagnose any issues. This consolidates the analysis into one place and allows us to produce more informative diagnostics. The parser's outcome will be parser::OpenMPAtomicConstruct object holding the directive, parser::Body, and an optional end-directive. 
The prior variety of OmpAtomicXyz classes, as well as OmpAtomicClause have been removed. READ, WRITE, etc. are now proper clauses. The semantic analysis consistently operates on "evaluation" representations, mainly evaluate::Expr (as SomeExpr) and evaluate::Assignment. The results of the semantic analysis are stored in a mutable member of the OpenMPAtomicConstruct node. This follows a precedent of having `typedExpr` member in parser::Expr, for example. This allows the lowering code to avoid duplicated handling of AST nodes. Using a BLOCK construct containing multiple statements for an ATOMIC construct that requires multiple statements is now allowed. In fact, any nesting of such BLOCK constructs is allowed. This implementation will parse, and perform semantic checks for both conditional-update and conditional-update-capture, although no MLIR will be generated for those. Instead, a TODO error will be issued prior to lowering. The allowed forms of the ATOMIC construct were based on the OpenMP 6.0 spec.
--- flang/docs/OpenMPSupport.md | 13 + flang/examples/FeatureList/FeatureList.cpp | 10 - .../FlangOmpReport/FlangOmpReportVisitor.cpp | 27 +- flang/include/flang/Parser/dump-parse-tree.h | 12 - flang/include/flang/Parser/parse-tree.h | 111 +- flang/include/flang/Semantics/tools.h | 147 ++ .../lib/Lower/OpenMP/DataSharingProcessor.cpp | 40 +- flang/lib/Lower/OpenMP/OpenMP.cpp | 1011 ++++----- flang/lib/Parser/openmp-parsers.cpp | 233 +- flang/lib/Parser/parse-tree.cpp | 28 + flang/lib/Parser/unparse.cpp | 102 +- flang/lib/Semantics/check-omp-structure.cpp | 1893 +++++++++++++---- flang/lib/Semantics/check-omp-structure.h | 59 +- flang/lib/Semantics/resolve-names.cpp | 7 +- flang/lib/Semantics/rewrite-directives.cpp | 126 +- flang/lib/Semantics/tools.cpp | 317 ++- flang/test/Examples/omp-atomic.f90 | 16 +- .../Lower/OpenMP/Todo/atomic-compare-fail.f90 | 2 +- .../test/Lower/OpenMP/Todo/atomic-compare.f90 | 2 +- flang/test/Lower/OpenMP/atomic-capture.f90 | 4 +- .../Lower/OpenMP/atomic-implicit-cast.f90 | 10 +- flang/test/Lower/OpenMP/atomic-privatize.f90 | 2 +- flang/test/Lower/OpenMP/atomic-write.f90 | 2 +- .../Lower/OpenMP/dump-atomic-analysis.f90 | 82 + flang/test/Parser/OpenMP/atomic-compare.f90 | 306 ++- flang/test/Parser/OpenMP/atomic-end.f90 | 63 + .../test/Semantics/OpenMP/atomic-compare.f90 | 29 +- .../Semantics/OpenMP/atomic-hint-clause.f90 | 23 +- flang/test/Semantics/OpenMP/atomic-read.f90 | 118 + .../OpenMP/atomic-update-capture.f90 | 77 + .../Semantics/OpenMP/atomic-update-only.f90 | 83 + .../OpenMP/atomic-update-overloaded-ops.f90 | 4 +- flang/test/Semantics/OpenMP/atomic-write.f90 | 81 + flang/test/Semantics/OpenMP/atomic.f90 | 31 +- flang/test/Semantics/OpenMP/atomic01.f90 | 221 +- flang/test/Semantics/OpenMP/atomic02.f90 | 47 +- flang/test/Semantics/OpenMP/atomic03.f90 | 51 +- flang/test/Semantics/OpenMP/atomic04.f90 | 99 +- flang/test/Semantics/OpenMP/atomic05.f90 | 12 +- .../Semantics/OpenMP/critical-hint-clause.f90 | 20 +- 
.../OpenMP/omp-atomic-assignment-stmt.f90 | 58 +- .../Semantics/OpenMP/requires-atomic01.f90 | 86 +- .../Semantics/OpenMP/requires-atomic02.f90 | 86 +- 43 files changed, 3753 insertions(+), 1998 deletions(-) create mode 100644 flang/test/Lower/OpenMP/dump-atomic-analysis.f90 create mode 100644 flang/test/Parser/OpenMP/atomic-end.f90 create mode 100644 flang/test/Semantics/OpenMP/atomic-read.f90 create mode 100644 flang/test/Semantics/OpenMP/atomic-update-capture.f90 create mode 100644 flang/test/Semantics/OpenMP/atomic-update-only.f90 create mode 100644 flang/test/Semantics/OpenMP/atomic-write.f90 diff --git a/flang/docs/OpenMPSupport.md b/flang/docs/OpenMPSupport.md index 7a4f95693a89c..c9f19c37fd7fa 100644 --- a/flang/docs/OpenMPSupport.md +++ b/flang/docs/OpenMPSupport.md @@ -60,3 +60,16 @@ Note : No distinction is made between the support in Parser/Semantics, MLIR, Low | target teams distribute parallel loop construct | P | device, reduction and dist_schedule clauses are not supported | | teams distribute parallel loop simd construct | P | reduction, dist_schedule, and linear clauses are not supported | | target teams distribute parallel loop simd construct | P | device, reduction, dist_schedule and linear clauses are not supported | + +## Extensions +### ATOMIC construct +The implementation of the ATOMIC construct follows OpenMP 6.0 with the following extensions: +- `x = x` is an allowed form of ATOMIC UPDATE. +This is motivated by the fact that the equivalent forms `x = x+0` or `x = x*1` are allowed. +- Explicit type conversions are allowed in ATOMIC READ, WRITE or UPDATE constructs, and in the capture statement in ATOMIC UPDATE CAPTURE. +The OpenMP spec requires intrinsic- or pointer-assignments, which include (as per the Fortran standard) implicit type conversions. Since such conversions need to be handled, allowing explicit conversions comes at no extra cost. +- A literal `.true.` or `.false.` is an allowed condition in ATOMIC UPDATE COMPARE. 
[1] +- A logical variable is an allowed form of the condition even if its value is not computed within the ATOMIC UPDATE COMPARE construct [1]. +- `expr equalop x` is an allowed condition in ATOMIC UPDATE COMPARE. [1] + +[1] Code generation for ATOMIC UPDATE COMPARE is not implemented yet. diff --git a/flang/examples/FeatureList/FeatureList.cpp b/flang/examples/FeatureList/FeatureList.cpp index d1407cf0ef239..a36b8719e365d 100644 --- a/flang/examples/FeatureList/FeatureList.cpp +++ b/flang/examples/FeatureList/FeatureList.cpp @@ -445,13 +445,6 @@ struct NodeVisitor { READ_FEATURE(ObjectDecl) READ_FEATURE(OldParameterStmt) READ_FEATURE(OmpAlignedClause) - READ_FEATURE(OmpAtomic) - READ_FEATURE(OmpAtomicCapture) - READ_FEATURE(OmpAtomicCapture::Stmt1) - READ_FEATURE(OmpAtomicCapture::Stmt2) - READ_FEATURE(OmpAtomicRead) - READ_FEATURE(OmpAtomicUpdate) - READ_FEATURE(OmpAtomicWrite) READ_FEATURE(OmpBeginBlockDirective) READ_FEATURE(OmpBeginLoopDirective) READ_FEATURE(OmpBeginSectionsDirective) @@ -480,7 +473,6 @@ struct NodeVisitor { READ_FEATURE(OmpIterationOffset) READ_FEATURE(OmpIterationVector) READ_FEATURE(OmpEndAllocators) - READ_FEATURE(OmpEndAtomic) READ_FEATURE(OmpEndBlockDirective) READ_FEATURE(OmpEndCriticalDirective) READ_FEATURE(OmpEndLoopDirective) @@ -566,8 +558,6 @@ struct NodeVisitor { READ_FEATURE(OpenMPDeclareTargetConstruct) READ_FEATURE(OmpMemoryOrderType) READ_FEATURE(OmpMemoryOrderClause) - READ_FEATURE(OmpAtomicClause) - READ_FEATURE(OmpAtomicClauseList) READ_FEATURE(OmpAtomicDefaultMemOrderClause) READ_FEATURE(OpenMPFlushConstruct) READ_FEATURE(OpenMPLoopConstruct) diff --git a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp index bf66151d59950..feb7b4eced9e9 100644 --- a/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp +++ b/flang/examples/FlangOmpReport/FlangOmpReportVisitor.cpp @@ -74,25 +74,19 @@ SourcePosition OpenMPCounterVisitor::getLocation(const OpenMPConstruct 
&c) { // the directive field. [&](const auto &c) -> SourcePosition { const CharBlock &source{std::get<0>(c.t).source}; - return (parsing->allCooked().GetSourcePositionRange(source))->first; + return parsing->allCooked().GetSourcePositionRange(source)->first; }, [&](const OpenMPAtomicConstruct &c) -> SourcePosition { - return std::visit( - [&](const auto &o) -> SourcePosition { - const CharBlock &source{std::get(o.t).source}; - return parsing->allCooked() - .GetSourcePositionRange(source) - ->first; - }, - c.u); + const CharBlock &source{c.source}; + return parsing->allCooked().GetSourcePositionRange(source)->first; }, [&](const OpenMPSectionConstruct &c) -> SourcePosition { const CharBlock &source{c.source}; - return (parsing->allCooked().GetSourcePositionRange(source))->first; + return parsing->allCooked().GetSourcePositionRange(source)->first; }, [&](const OpenMPUtilityConstruct &c) -> SourcePosition { const CharBlock &source{c.source}; - return (parsing->allCooked().GetSourcePositionRange(source))->first; + return parsing->allCooked().GetSourcePositionRange(source)->first; }, }, c.u); @@ -157,14 +151,9 @@ std::string OpenMPCounterVisitor::getName(const OpenMPConstruct &c) { return normalize_construct_name(source.ToString()); }, [&](const OpenMPAtomicConstruct &c) -> std::string { - return std::visit( - [&](const auto &c) { - // Get source from the verbatim fields - const CharBlock &source{std::get(c.t).source}; - return "atomic-" + - normalize_construct_name(source.ToString()); - }, - c.u); + auto &dirSpec = std::get(c.t); + auto &dirName = std::get(dirSpec.t); + return normalize_construct_name(dirName.source.ToString()); }, [&](const OpenMPUtilityConstruct &c) -> std::string { const CharBlock &source{c.source}; diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index df9278697346f..c6a5150a85a4c 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ 
-532,15 +532,6 @@ class ParseTreeDumper { NODE(parser, OmpAtClause) NODE_ENUM(OmpAtClause, ActionTime) NODE_ENUM(OmpSeverityClause, Severity) - NODE(parser, OmpAtomic) - NODE(parser, OmpAtomicCapture) - NODE(OmpAtomicCapture, Stmt1) - NODE(OmpAtomicCapture, Stmt2) - NODE(parser, OmpAtomicCompare) - NODE(parser, OmpAtomicCompareIfStmt) - NODE(parser, OmpAtomicRead) - NODE(parser, OmpAtomicUpdate) - NODE(parser, OmpAtomicWrite) NODE(parser, OmpBeginBlockDirective) NODE(parser, OmpBeginLoopDirective) NODE(parser, OmpBeginSectionsDirective) @@ -587,7 +578,6 @@ class ParseTreeDumper { NODE(parser, OmpDoacrossClause) NODE(parser, OmpDestroyClause) NODE(parser, OmpEndAllocators) - NODE(parser, OmpEndAtomic) NODE(parser, OmpEndBlockDirective) NODE(parser, OmpEndCriticalDirective) NODE(parser, OmpEndLoopDirective) @@ -716,8 +706,6 @@ class ParseTreeDumper { NODE(parser, OpenMPDeclareMapperConstruct) NODE_ENUM(common, OmpMemoryOrderType) NODE(parser, OmpMemoryOrderClause) - NODE(parser, OmpAtomicClause) - NODE(parser, OmpAtomicClauseList) NODE(parser, OmpAtomicDefaultMemOrderClause) NODE(parser, OpenMPDepobjConstruct) NODE(parser, OpenMPUtilityConstruct) diff --git a/flang/include/flang/Parser/parse-tree.h b/flang/include/flang/Parser/parse-tree.h index c99006f0c1c22..67405f88e09f2 100644 --- a/flang/include/flang/Parser/parse-tree.h +++ b/flang/include/flang/Parser/parse-tree.h @@ -4857,94 +4857,37 @@ struct OmpMemoryOrderClause { CharBlock source; }; -// 2.17.7 Atomic construct -// atomic-clause -> memory-order-clause | HINT(hint-expression) | -// FAIL(memory-order) -struct OmpAtomicClause { - UNION_CLASS_BOILERPLATE(OmpAtomicClause); - CharBlock source; - std::variant u; -}; - -// atomic-clause-list -> [atomic-clause, [atomic-clause], ...] 
-struct OmpAtomicClauseList { - WRAPPER_CLASS_BOILERPLATE(OmpAtomicClauseList, std::list); - CharBlock source; -}; - -// END ATOMIC -EMPTY_CLASS(OmpEndAtomic); - -// ATOMIC READ -struct OmpAtomicRead { - TUPLE_CLASS_BOILERPLATE(OmpAtomicRead); - CharBlock source; - std::tuple, std::optional> - t; -}; - -// ATOMIC WRITE -struct OmpAtomicWrite { - TUPLE_CLASS_BOILERPLATE(OmpAtomicWrite); - CharBlock source; - std::tuple, std::optional> - t; -}; - -// ATOMIC UPDATE -struct OmpAtomicUpdate { - TUPLE_CLASS_BOILERPLATE(OmpAtomicUpdate); - CharBlock source; - std::tuple, std::optional> - t; -}; - -// ATOMIC CAPTURE -struct OmpAtomicCapture { - TUPLE_CLASS_BOILERPLATE(OmpAtomicCapture); - CharBlock source; - WRAPPER_CLASS(Stmt1, Statement); - WRAPPER_CLASS(Stmt2, Statement); - std::tuple - t; -}; - -struct OmpAtomicCompareIfStmt { - UNION_CLASS_BOILERPLATE(OmpAtomicCompareIfStmt); - std::variant, common::Indirection> u; -}; - -// ATOMIC COMPARE (OpenMP 5.1, OPenMP 5.2 spec: 15.8.4) -struct OmpAtomicCompare { - TUPLE_CLASS_BOILERPLATE(OmpAtomicCompare); +struct OpenMPAtomicConstruct { + llvm::omp::Clause GetKind() const; + bool IsCapture() const; + bool IsCompare() const; + TUPLE_CLASS_BOILERPLATE(OpenMPAtomicConstruct); CharBlock source; - std::tuple> + std::tuple> t; -}; -// ATOMIC -struct OmpAtomic { - TUPLE_CLASS_BOILERPLATE(OmpAtomic); - CharBlock source; - std::tuple, - std::optional> - t; -}; + // Information filled out during semantic checks to avoid duplication + // of analyses. 
+ struct Analysis { + static constexpr int None = 0; + static constexpr int Read = 1; + static constexpr int Write = 2; + static constexpr int Update = Read | Write; + static constexpr int Action = 3; // Bitmask for None, Read, Write, Update + static constexpr int IfTrue = 4; + static constexpr int IfFalse = 8; + static constexpr int Condition = 12; // Bitmask for IfTrue, IfFalse + + struct Op { + int what; + AssignmentStmt::TypedAssignment assign; + }; + TypedExpr atom, cond; + Op op0, op1; + }; -// 2.17.7 atomic -> -// ATOMIC [atomic-clause-list] atomic-construct [atomic-clause-list] | -// ATOMIC [atomic-clause-list] -// atomic-construct -> READ | WRITE | UPDATE | CAPTURE | COMPARE -struct OpenMPAtomicConstruct { - UNION_CLASS_BOILERPLATE(OpenMPAtomicConstruct); - std::variant - u; + mutable Analysis analysis; }; // OpenMP directives that associate with loop(s) diff --git a/flang/include/flang/Semantics/tools.h b/flang/include/flang/Semantics/tools.h index 4b2bb4fa167f8..b13370512e5cc 100644 --- a/flang/include/flang/Semantics/tools.h +++ b/flang/include/flang/Semantics/tools.h @@ -755,5 +755,152 @@ bool HadUseError(SemanticsContext &, SourceName at, const Symbol *); // Checks whether the symbol on the LHS is present in the RHS expression. 
bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs); + +namespace operation { + +enum class Operator { + Unknown, + Add, + And, + Associated, + Call, + Constant, + Convert, + Div, + Eq, + Eqv, + False, + Ge, + Gt, + Identity, + Intrinsic, + Le, + Lt, + Max, + Min, + Mul, + Ne, + Neqv, + Not, + Or, + Pow, + Resize, // Convert within the same TypeCategory + Sub, + True, +}; + +std::string ToString(Operator op); + +template +Operator OperationCode( + const evaluate::Operation, Ts...> &op) { + switch (op.derived().logicalOperator) { + case common::LogicalOperator::And: + return Operator::And; + case common::LogicalOperator::Or: + return Operator::Or; + case common::LogicalOperator::Eqv: + return Operator::Eqv; + case common::LogicalOperator::Neqv: + return Operator::Neqv; + case common::LogicalOperator::Not: + return Operator::Not; + } + return Operator::Unknown; +} + +template +Operator OperationCode( + const evaluate::Operation, Ts...> &op) { + switch (op.derived().opr) { + case common::RelationalOperator::LT: + return Operator::Lt; + case common::RelationalOperator::LE: + return Operator::Le; + case common::RelationalOperator::EQ: + return Operator::Eq; + case common::RelationalOperator::NE: + return Operator::Ne; + case common::RelationalOperator::GE: + return Operator::Ge; + case common::RelationalOperator::GT: + return Operator::Gt; + } + return Operator::Unknown; +} + +template +Operator OperationCode(const evaluate::Operation, Ts...> &op) { + return Operator::Add; +} + +template +Operator OperationCode( + const evaluate::Operation, Ts...> &op) { + return Operator::Sub; +} + +template +Operator OperationCode( + const evaluate::Operation, Ts...> &op) { + return Operator::Mul; +} + +template +Operator OperationCode( + const evaluate::Operation, Ts...> &op) { + return Operator::Div; +} + +template +Operator OperationCode( + const evaluate::Operation, Ts...> &op) { + return Operator::Pow; +} + +template +Operator OperationCode( + const 
evaluate::Operation, Ts...> &op) { + return Operator::Pow; +} + +template +Operator OperationCode( + const evaluate::Operation, Ts...> &op) { + if constexpr (C == T::category) { + return Operator::Resize; + } else { + return Operator::Convert; + } +} + +template // +Operator OperationCode(const evaluate::Constant &x) { + return Operator::Constant; +} + +template // +Operator OperationCode(const T &) { + return Operator::Unknown; +} + +Operator OperationCode(const evaluate::ProcedureDesignator &proc); + +} // namespace operation + +/// Return information about the top-level operation (ignoring parentheses): +/// the operation code and the list of arguments. +std::pair> GetTopLevelOperation( + const SomeExpr &expr); + +/// Check if expr is same as x, or a sequence of Convert operations on x. +bool IsSameOrConvertOf(const SomeExpr &expr, const SomeExpr &x); + +/// Strip away any top-level Convert operations (if any exist) and return +/// the input value. A ComplexConstructor(x, 0) is also considered as a +/// convert operation. +/// If the input is not Operation, Designator, FunctionRef or Constant, +/// it returns std::nullopt. 
+MaybeExpr GetConvertInput(const SomeExpr &x); } // namespace Fortran::semantics #endif // FORTRAN_SEMANTICS_TOOLS_H_ diff --git a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp index f8c68bfc3056a..1b8670b379f82 100644 --- a/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp +++ b/flang/lib/Lower/OpenMP/DataSharingProcessor.cpp @@ -356,26 +356,26 @@ getSource(const semantics::SemanticsContext &semaCtx, const parser::CharBlock *source = nullptr; auto ompConsVisit = [&](const parser::OpenMPConstruct &x) { - std::visit(common::visitors{ - [&](const parser::OpenMPSectionsConstruct &x) { - source = &std::get<0>(x.t).source; - }, - [&](const parser::OpenMPLoopConstruct &x) { - source = &std::get<0>(x.t).source; - }, - [&](const parser::OpenMPBlockConstruct &x) { - source = &std::get<0>(x.t).source; - }, - [&](const parser::OpenMPCriticalConstruct &x) { - source = &std::get<0>(x.t).source; - }, - [&](const parser::OpenMPAtomicConstruct &x) { - std::visit([&](const auto &x) { source = &x.source; }, - x.u); - }, - [&](const auto &x) { source = &x.source; }, - }, - x.u); + std::visit( + common::visitors{ + [&](const parser::OpenMPSectionsConstruct &x) { + source = &std::get<0>(x.t).source; + }, + [&](const parser::OpenMPLoopConstruct &x) { + source = &std::get<0>(x.t).source; + }, + [&](const parser::OpenMPBlockConstruct &x) { + source = &std::get<0>(x.t).source; + }, + [&](const parser::OpenMPCriticalConstruct &x) { + source = &std::get<0>(x.t).source; + }, + [&](const parser::OpenMPAtomicConstruct &x) { + source = &std::get(x.t).source; + }, + [&](const auto &x) { source = &x.source; }, + }, + x.u); }; eval.visit(common::visitors{ diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 784749bba5a0c..3f3b85696db31 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -41,10 +41,13 @@ #include "mlir/Transforms/RegionUtils.h" #include "llvm/ADT/STLExtras.h" 
#include "llvm/Frontend/OpenMP/OMPConstants.h" +#include "llvm/Support/CommandLine.h" using namespace Fortran::lower::omp; using namespace Fortran::common::openmp; +static llvm::cl::opt DumpAtomicAnalysis("fdebug-dump-atomic-analysis"); + //===----------------------------------------------------------------------===// // Code generation helper functions //===----------------------------------------------------------------------===// @@ -1122,6 +1125,16 @@ markDeclareTarget(mlir::Operation *op, lower::AbstractConverter &converter, declareTargetOp.setDeclareTarget(deviceType, captureClause); } +static bool isPointerAssignment(const evaluate::Assignment &assign) { + return common::visit( + common::visitors{ + [](const evaluate::Assignment::BoundsSpec &) { return true; }, + [](const evaluate::Assignment::BoundsRemapping &) { return true; }, + [](const auto &) { return false; }, + }, + assign.u); +} + //===----------------------------------------------------------------------===// // Op body generation helper structures and functions //===----------------------------------------------------------------------===// @@ -2676,645 +2689,215 @@ genTeamsOp(lower::AbstractConverter &converter, lower::SymMap &symTable, //===----------------------------------------------------------------------===// // Code generation for atomic operations //===----------------------------------------------------------------------===// +static fir::FirOpBuilder::InsertPoint +getInsertionPointBefore(mlir::Operation *op) { + return fir::FirOpBuilder::InsertPoint(op->getBlock(), + mlir::Block::iterator(op)); +} -/// Populates \p hint and \p memoryOrder with appropriate clause information -/// if present on atomic construct. 
-static void genOmpAtomicHintAndMemoryOrderClauses( - lower::AbstractConverter &converter, - const parser::OmpAtomicClauseList &clauseList, mlir::IntegerAttr &hint, - mlir::omp::ClauseMemoryOrderKindAttr &memoryOrder) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - for (const parser::OmpAtomicClause &clause : clauseList.v) { - common::visit( - common::visitors{ - [&](const parser::OmpMemoryOrderClause &s) { - auto kind = common::visit( - common::visitors{ - [&](const parser::OmpClause::AcqRel &) { - return mlir::omp::ClauseMemoryOrderKind::Acq_rel; - }, - [&](const parser::OmpClause::Acquire &) { - return mlir::omp::ClauseMemoryOrderKind::Acquire; - }, - [&](const parser::OmpClause::Relaxed &) { - return mlir::omp::ClauseMemoryOrderKind::Relaxed; - }, - [&](const parser::OmpClause::Release &) { - return mlir::omp::ClauseMemoryOrderKind::Release; - }, - [&](const parser::OmpClause::SeqCst &) { - return mlir::omp::ClauseMemoryOrderKind::Seq_cst; - }, - [&](auto &&) -> mlir::omp::ClauseMemoryOrderKind { - llvm_unreachable("Unexpected clause"); - }, - }, - s.v.u); - memoryOrder = mlir::omp::ClauseMemoryOrderKindAttr::get( - firOpBuilder.getContext(), kind); - }, - [&](const parser::OmpHintClause &s) { - const auto *expr = semantics::GetExpr(s.v); - uint64_t hintExprValue = *evaluate::ToInt64(*expr); - hint = firOpBuilder.getI64IntegerAttr(hintExprValue); - }, - [&](const parser::OmpFailClause &) {}, - }, - clause.u); +static fir::FirOpBuilder::InsertPoint +getInsertionPointAfter(mlir::Operation *op) { + return fir::FirOpBuilder::InsertPoint(op->getBlock(), + ++mlir::Block::iterator(op)); +} + +static mlir::IntegerAttr getAtomicHint(lower::AbstractConverter &converter, + const List &clauses) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + for (const Clause &clause : clauses) { + if (clause.id != llvm::omp::Clause::OMPC_hint) + continue; + auto &hint = std::get(clause.u); + auto maybeVal = evaluate::ToInt64(hint.v); + CHECK(maybeVal); 
+ return builder.getI64IntegerAttr(*maybeVal); } + return nullptr; } -static void processOmpAtomicTODO(mlir::Type elementType, mlir::Location loc) { - if (!elementType) - return; - assert(fir::isa_trivial(fir::unwrapRefType(elementType)) && - "is supported type for omp atomic"); -} - -/// Used to generate atomic.read operation which is created in existing -/// location set by builder. -static void genAtomicCaptureStatement( - lower::AbstractConverter &converter, mlir::Value fromAddress, - mlir::Value toAddress, - const parser::OmpAtomicClauseList *leftHandClauseList, - const parser::OmpAtomicClauseList *rightHandClauseList, - mlir::Type elementType, mlir::Location loc) { - // Generate `atomic.read` operation for atomic assigment statements - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); +static mlir::omp::ClauseMemoryOrderKindAttr +getAtomicMemoryOrder(lower::AbstractConverter &converter, + semantics::SemanticsContext &semaCtx, + const List &clauses) { + std::optional kind; + unsigned version = semaCtx.langOptions().OpenMPVersion; - processOmpAtomicTODO(elementType, loc); - - // If no hint clause is specified, the effect is as if - // hint(omp_sync_hint_none) had been specified. - mlir::IntegerAttr hint = nullptr; - - mlir::omp::ClauseMemoryOrderKindAttr memoryOrder = nullptr; - if (leftHandClauseList) - genOmpAtomicHintAndMemoryOrderClauses(converter, *leftHandClauseList, hint, - memoryOrder); - if (rightHandClauseList) - genOmpAtomicHintAndMemoryOrderClauses(converter, *rightHandClauseList, hint, - memoryOrder); - firOpBuilder.create(loc, fromAddress, toAddress, - mlir::TypeAttr::get(elementType), - hint, memoryOrder); -} - -/// Used to generate atomic.write operation which is created in existing -/// location set by builder. 
-static void genAtomicWriteStatement( - lower::AbstractConverter &converter, mlir::Value lhsAddr, - mlir::Value rhsExpr, const parser::OmpAtomicClauseList *leftHandClauseList, - const parser::OmpAtomicClauseList *rightHandClauseList, mlir::Location loc, - mlir::Value *evaluatedExprValue = nullptr) { - // Generate `atomic.write` operation for atomic assignment statements - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + for (const Clause &clause : clauses) { + switch (clause.id) { + case llvm::omp::Clause::OMPC_acq_rel: + kind = mlir::omp::ClauseMemoryOrderKind::Acq_rel; + break; + case llvm::omp::Clause::OMPC_acquire: + kind = mlir::omp::ClauseMemoryOrderKind::Acquire; + break; + case llvm::omp::Clause::OMPC_relaxed: + kind = mlir::omp::ClauseMemoryOrderKind::Relaxed; + break; + case llvm::omp::Clause::OMPC_release: + kind = mlir::omp::ClauseMemoryOrderKind::Release; + break; + case llvm::omp::Clause::OMPC_seq_cst: + kind = mlir::omp::ClauseMemoryOrderKind::Seq_cst; + break; + default: + break; + } + } - mlir::Type varType = fir::unwrapRefType(lhsAddr.getType()); - // Create a conversion outside the capture block. - auto insertionPoint = firOpBuilder.saveInsertionPoint(); - firOpBuilder.setInsertionPointAfter(rhsExpr.getDefiningOp()); - rhsExpr = firOpBuilder.createConvert(loc, varType, rhsExpr); - firOpBuilder.restoreInsertionPoint(insertionPoint); - - processOmpAtomicTODO(varType, loc); - - // If no hint clause is specified, the effect is as if - // hint(omp_sync_hint_none) had been specified. 
- mlir::IntegerAttr hint = nullptr; - mlir::omp::ClauseMemoryOrderKindAttr memoryOrder = nullptr; - if (leftHandClauseList) - genOmpAtomicHintAndMemoryOrderClauses(converter, *leftHandClauseList, hint, - memoryOrder); - if (rightHandClauseList) - genOmpAtomicHintAndMemoryOrderClauses(converter, *rightHandClauseList, hint, - memoryOrder); - firOpBuilder.create(loc, lhsAddr, rhsExpr, hint, - memoryOrder); -} - -/// Used to generate atomic.update operation which is created in existing -/// location set by builder. -static void genAtomicUpdateStatement( - lower::AbstractConverter &converter, mlir::Value lhsAddr, - mlir::Type varType, const parser::Variable &assignmentStmtVariable, - const parser::Expr &assignmentStmtExpr, - const parser::OmpAtomicClauseList *leftHandClauseList, - const parser::OmpAtomicClauseList *rightHandClauseList, mlir::Location loc, - mlir::Operation *atomicCaptureOp = nullptr, - lower::StatementContext *atomicCaptureStmtCtx = nullptr) { - // Generate `atomic.update` operation for atomic assignment statements - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::Location currentLocation = converter.getCurrentLocation(); + // Starting with 5.1, if no memory-order clause is present, the effect + // is as if "relaxed" was present. 
+ if (!kind) { + if (version <= 50) + return nullptr; + kind = mlir::omp::ClauseMemoryOrderKind::Relaxed; + } + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + return mlir::omp::ClauseMemoryOrderKindAttr::get(builder.getContext(), *kind); +} + +static mlir::Operation * // +genAtomicRead(lower::AbstractConverter &converter, mlir::Location loc, + lower::StatementContext &stmtCtx, mlir::Value atomAddr, + const semantics::SomeExpr &atom, + const evaluate::Assignment &assign, mlir::IntegerAttr hint, + mlir::omp::ClauseMemoryOrderKindAttr memOrder, + fir::FirOpBuilder::InsertPoint preAt, + fir::FirOpBuilder::InsertPoint atomicAt, + fir::FirOpBuilder::InsertPoint postAt) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + builder.restoreInsertionPoint(preAt); + + mlir::Value storeAddr = + fir::getBase(converter.genExprAddr(assign.lhs, stmtCtx, &loc)); + mlir::Type atomType = fir::unwrapRefType(atomAddr.getType()); + mlir::Type storeType = fir::unwrapRefType(storeAddr.getType()); + + mlir::Value toAddr = [&]() { + if (atomType == storeType) + return storeAddr; + return builder.createTemporary(loc, atomType, ".tmp.atomval"); + }(); - // Create the omp.atomic.update or acc.atomic.update operation - // - // func.func @_QPsb() { - // %0 = fir.alloca i32 {bindc_name = "a", uniq_name = "_QFsbEa"} - // %1 = fir.alloca i32 {bindc_name = "b", uniq_name = "_QFsbEb"} - // %2 = fir.load %1 : !fir.ref - // omp.atomic.update %0 : !fir.ref { - // ^bb0(%arg0: i32): - // %3 = arith.addi %arg0, %2 : i32 - // omp.yield(%3 : i32) - // } - // return - // } - - auto getArgExpression = - [](std::list::const_iterator it) { - const auto &arg{std::get((*it).t)}; - const auto *parserExpr{ - std::get_if>(&arg.u)}; - return parserExpr; - }; + builder.restoreInsertionPoint(atomicAt); + mlir::Operation *op = builder.create( + loc, atomAddr, toAddr, mlir::TypeAttr::get(atomType), hint, memOrder); + + if (atomType != storeType) { + lower::ExprToValueMap overrides; + // The READ 
operation could be a part of UPDATE CAPTURE, so make sure + // we don't emit extra code into the body of the atomic op. + builder.restoreInsertionPoint(postAt); + mlir::Value load = builder.create(loc, toAddr); + overrides.try_emplace(&atom, load); + + converter.overrideExprValues(&overrides); + mlir::Value value = + fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc)); + converter.resetExprOverrides(); - // Lower any non atomic sub-expression before the atomic operation, and - // map its lowered value to the semantic representation. - lower::ExprToValueMap exprValueOverrides; - // Max and min intrinsics can have a list of Args. Hence we need a list - // of nonAtomicSubExprs to hoist. Currently, only the load is hoisted. - llvm::SmallVector nonAtomicSubExprs; - common::visit( - common::visitors{ - [&](const common::Indirection &funcRef) - -> void { - const auto &args{std::get>( - funcRef.value().v.t)}; - std::list::const_iterator beginIt = - args.begin(); - std::list::const_iterator endIt = args.end(); - const auto *exprFirst{getArgExpression(beginIt)}; - if (exprFirst && exprFirst->value().source == - assignmentStmtVariable.GetSource()) { - // Add everything except the first - beginIt++; - } else { - // Add everything except the last - endIt--; - } - std::list::const_iterator it; - for (it = beginIt; it != endIt; it++) { - const common::Indirection *expr = - getArgExpression(it); - if (expr) - nonAtomicSubExprs.push_back(semantics::GetExpr(*expr)); - } - }, - [&](const auto &op) -> void { - using T = std::decay_t; - if constexpr (std::is_base_of::value) { - const auto &exprLeft{std::get<0>(op.t)}; - const auto &exprRight{std::get<1>(op.t)}; - if (exprLeft.value().source == assignmentStmtVariable.GetSource()) - nonAtomicSubExprs.push_back(semantics::GetExpr(exprRight)); - else - nonAtomicSubExprs.push_back(semantics::GetExpr(exprLeft)); - } - }, - }, - assignmentStmtExpr.u); - lower::StatementContext nonAtomicStmtCtx; - lower::StatementContext 
*stmtCtxPtr = &nonAtomicStmtCtx; - if (!nonAtomicSubExprs.empty()) { - // Generate non atomic part before all the atomic operations. - auto insertionPoint = firOpBuilder.saveInsertionPoint(); - if (atomicCaptureOp) { - assert(atomicCaptureStmtCtx && "must specify statement context"); - firOpBuilder.setInsertionPoint(atomicCaptureOp); - // Any clean-ups associated with the expression lowering - // must also be generated outside of the atomic update operation - // and after the atomic capture operation. - // The atomicCaptureStmtCtx will be finalized at the end - // of the atomic capture operation generation. - stmtCtxPtr = atomicCaptureStmtCtx; - } - mlir::Value nonAtomicVal; - for (auto *nonAtomicSubExpr : nonAtomicSubExprs) { - nonAtomicVal = fir::getBase(converter.genExprValue( - currentLocation, *nonAtomicSubExpr, *stmtCtxPtr)); - exprValueOverrides.try_emplace(nonAtomicSubExpr, nonAtomicVal); - } - if (atomicCaptureOp) - firOpBuilder.restoreInsertionPoint(insertionPoint); + builder.create(loc, value, storeAddr); } + return op; +} - mlir::Operation *atomicUpdateOp = nullptr; - // If no hint clause is specified, the effect is as if - // hint(omp_sync_hint_none) had been specified. 
- mlir::IntegerAttr hint = nullptr; - mlir::omp::ClauseMemoryOrderKindAttr memoryOrder = nullptr; - if (leftHandClauseList) - genOmpAtomicHintAndMemoryOrderClauses(converter, *leftHandClauseList, hint, - memoryOrder); - if (rightHandClauseList) - genOmpAtomicHintAndMemoryOrderClauses(converter, *rightHandClauseList, hint, - memoryOrder); - atomicUpdateOp = firOpBuilder.create( - currentLocation, lhsAddr, hint, memoryOrder); - - processOmpAtomicTODO(varType, loc); - - llvm::SmallVector varTys = {varType}; - llvm::SmallVector locs = {currentLocation}; - firOpBuilder.createBlock(&atomicUpdateOp->getRegion(0), {}, varTys, locs); - mlir::Value val = - fir::getBase(atomicUpdateOp->getRegion(0).front().getArgument(0)); - - exprValueOverrides.try_emplace(semantics::GetExpr(assignmentStmtVariable), - val); - { - // statement context inside the atomic block. - converter.overrideExprValues(&exprValueOverrides); - lower::StatementContext atomicStmtCtx; - mlir::Value rhsExpr = fir::getBase(converter.genExprValue( - *semantics::GetExpr(assignmentStmtExpr), atomicStmtCtx)); - mlir::Type exprType = fir::unwrapRefType(rhsExpr.getType()); - if (fir::isa_complex(exprType) && !fir::isa_complex(varType)) { - // Emit an additional `ExtractValueOp` if the expression is of complex - // type - auto extract = firOpBuilder.create( - currentLocation, - mlir::cast(exprType).getElementType(), rhsExpr, - firOpBuilder.getArrayAttr( - firOpBuilder.getIntegerAttr(firOpBuilder.getIndexType(), 0))); - mlir::Value convertResult = firOpBuilder.create( - currentLocation, varType, extract); - firOpBuilder.create(currentLocation, convertResult); - } else { - mlir::Value convertResult = - firOpBuilder.createConvert(currentLocation, varType, rhsExpr); - firOpBuilder.create(currentLocation, convertResult); +static mlir::Operation * // +genAtomicWrite(lower::AbstractConverter &converter, mlir::Location loc, + lower::StatementContext &stmtCtx, mlir::Value atomAddr, + const semantics::SomeExpr &atom, + const 
evaluate::Assignment &assign, mlir::IntegerAttr hint, + mlir::omp::ClauseMemoryOrderKindAttr memOrder, + fir::FirOpBuilder::InsertPoint preAt, + fir::FirOpBuilder::InsertPoint atomicAt, + fir::FirOpBuilder::InsertPoint postAt) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + builder.restoreInsertionPoint(preAt); + + mlir::Value value = + fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc)); + mlir::Type atomType = fir::unwrapRefType(atomAddr.getType()); + mlir::Value converted = builder.createConvert(loc, atomType, value); + + builder.restoreInsertionPoint(atomicAt); + mlir::Operation *op = builder.create( + loc, atomAddr, converted, hint, memOrder); + return op; +} + +static mlir::Operation * +genAtomicUpdate(lower::AbstractConverter &converter, mlir::Location loc, + lower::StatementContext &stmtCtx, mlir::Value atomAddr, + const semantics::SomeExpr &atom, + const evaluate::Assignment &assign, mlir::IntegerAttr hint, + mlir::omp::ClauseMemoryOrderKindAttr memOrder, + fir::FirOpBuilder::InsertPoint preAt, + fir::FirOpBuilder::InsertPoint atomicAt, + fir::FirOpBuilder::InsertPoint postAt) { + lower::ExprToValueMap overrides; + lower::StatementContext naCtx; + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + builder.restoreInsertionPoint(preAt); + + mlir::Type atomType = fir::unwrapRefType(atomAddr.getType()); + + // This must exist by now. + SomeExpr input = *semantics::GetConvertInput(assign.rhs); + std::vector args{semantics::GetTopLevelOperation(input).second}; + assert(!args.empty() && "Update operation without arguments"); + for (auto &arg : args) { + if (!semantics::IsSameOrConvertOf(arg, atom)) { + mlir::Value val = fir::getBase(converter.genExprValue(arg, naCtx, &loc)); + overrides.try_emplace(&arg, val); } - converter.resetExprOverrides(); } - firOpBuilder.setInsertionPointAfter(atomicUpdateOp); -} - -/// Processes an atomic construct with write clause. 
-static void genAtomicWrite(lower::AbstractConverter &converter, - const parser::OmpAtomicWrite &atomicWrite, - mlir::Location loc) { - const parser::OmpAtomicClauseList *rightHandClauseList = nullptr; - const parser::OmpAtomicClauseList *leftHandClauseList = nullptr; - // Get the address of atomic read operands. - rightHandClauseList = &std::get<2>(atomicWrite.t); - leftHandClauseList = &std::get<0>(atomicWrite.t); - - const parser::AssignmentStmt &stmt = - std::get>(atomicWrite.t) - .statement; - const evaluate::Assignment &assign = *stmt.typedAssignment->v; - lower::StatementContext stmtCtx; - // Get the value and address of atomic write operands. - mlir::Value rhsExpr = - fir::getBase(converter.genExprValue(assign.rhs, stmtCtx)); - mlir::Value lhsAddr = - fir::getBase(converter.genExprAddr(assign.lhs, stmtCtx)); - genAtomicWriteStatement(converter, lhsAddr, rhsExpr, leftHandClauseList, - rightHandClauseList, loc); -} - -/* - Emit an implicit cast. Different yet compatible types on - omp.atomic.read constitute valid Fortran. The OMPIRBuilder will - emit atomic instructions (on primitive types) and `__atomic_load` - libcall (on complex type) without explicitly converting - between such compatible types. The OMPIRBuilder relies on the - frontend to resolve such inconsistencies between `omp.atomic.read ` - operand types. Similar inconsistencies between operand types in - `omp.atomic.write` are resolved through implicit casting by use of typed - assignment (i.e. `evaluate::Assignment`). However, use of typed - assignment in `omp.atomic.read` (of form `v = x`) leads to an unsafe, - non-atomic load of `x` into a temporary `alloca`, followed by an atomic - read of form `v = alloca`. Hence, it is needed to perform a custom - implicit cast. - - An atomic read of form `v = x` would (without implicit casting) - lower to `omp.atomic.read %v = %x : !fir.ref, !fir.ref, - type2`. 
This implicit casting will rather generate the following FIR: - - %alloca = fir.alloca type2 - omp.atomic.read %alloca = %x : !fir.ref, !fir.ref, type2 - %load = fir.load %alloca : !fir.ref - %cvt = fir.convert %load : (type2) -> type1 - fir.store %cvt to %v : !fir.ref - - These sequence of operations is thread-safe since each thread allocates - the `alloca` in its stack, and performs `%alloca = %x` atomically. Once - safely read, each thread performs the implicit cast on the local - `alloca`, and writes the final result to `%v`. - -/// \param builder : FirOpBuilder -/// \param loc : Location for FIR generation -/// \param toAddress : Address of %v -/// \param toType : Type of %v -/// \param fromType : Type of %x -/// \param alloca : Thread scoped `alloca` -// It is the responsibility of the callee -// to position the `alloca` at `AllocaIP` -// through `builder.getAllocaBlock()` -*/ - -static void emitAtomicReadImplicitCast(fir::FirOpBuilder &builder, - mlir::Location loc, - mlir::Value toAddress, mlir::Type toType, - mlir::Type fromType, - mlir::Value alloca) { - auto load = builder.create(loc, alloca); - if (fir::isa_complex(fromType) && !fir::isa_complex(toType)) { - // Emit an additional `ExtractValueOp` if `fromAddress` is of complex - // type, but `toAddress` is not. - auto extract = builder.create( - loc, mlir::cast(fromType).getElementType(), load, - builder.getArrayAttr( - builder.getIntegerAttr(builder.getIndexType(), 0))); - auto cvt = builder.create(loc, toType, extract); - builder.create(loc, cvt, toAddress); - } else if (!fir::isa_complex(fromType) && fir::isa_complex(toType)) { - // Emit an additional `InsertValueOp` if `toAddress` is of complex - // type, but `fromAddress` is not. 
- mlir::Value undef = builder.create(loc, toType); - mlir::Type complexEleTy = - mlir::cast(toType).getElementType(); - mlir::Value cvt = builder.create(loc, complexEleTy, load); - mlir::Value zero = builder.createRealZeroConstant(loc, complexEleTy); - mlir::Value idx0 = builder.create( - loc, toType, undef, cvt, - builder.getArrayAttr( - builder.getIntegerAttr(builder.getIndexType(), 0))); - mlir::Value idx1 = builder.create( - loc, toType, idx0, zero, - builder.getArrayAttr( - builder.getIntegerAttr(builder.getIndexType(), 1))); - builder.create(loc, idx1, toAddress); - } else { - auto cvt = builder.create(loc, toType, load); - builder.create(loc, cvt, toAddress); - } -} -/// Processes an atomic construct with read clause. -static void genAtomicRead(lower::AbstractConverter &converter, - const parser::OmpAtomicRead &atomicRead, - mlir::Location loc) { - const parser::OmpAtomicClauseList *rightHandClauseList = nullptr; - const parser::OmpAtomicClauseList *leftHandClauseList = nullptr; - // Get the address of atomic read operands. 
- rightHandClauseList = &std::get<2>(atomicRead.t); - leftHandClauseList = &std::get<0>(atomicRead.t); + builder.restoreInsertionPoint(atomicAt); + auto updateOp = + builder.create(loc, atomAddr, hint, memOrder); - const auto &assignmentStmtExpr = std::get( - std::get>(atomicRead.t) - .statement.t); - const auto &assignmentStmtVariable = std::get( - std::get>(atomicRead.t) - .statement.t); + mlir::Region ®ion = updateOp->getRegion(0); + mlir::Block *block = builder.createBlock(®ion, {}, {atomType}, {loc}); + mlir::Value localAtom = fir::getBase(block->getArgument(0)); + overrides.try_emplace(&atom, localAtom); - lower::StatementContext stmtCtx; - const semantics::SomeExpr &fromExpr = *semantics::GetExpr(assignmentStmtExpr); - mlir::Type elementType = converter.genType(fromExpr); - mlir::Value fromAddress = - fir::getBase(converter.genExprAddr(fromExpr, stmtCtx)); - mlir::Value toAddress = fir::getBase(converter.genExprAddr( - *semantics::GetExpr(assignmentStmtVariable), stmtCtx)); - - if (fromAddress.getType() != toAddress.getType()) { - - mlir::Type toType = fir::unwrapRefType(toAddress.getType()); - mlir::Type fromType = fir::unwrapRefType(fromAddress.getType()); - fir::FirOpBuilder &builder = converter.getFirOpBuilder(); - auto oldIP = builder.saveInsertionPoint(); - builder.setInsertionPointToStart(builder.getAllocaBlock()); - mlir::Value alloca = builder.create( - loc, fromType); // Thread scope `alloca` to atomically read `%x`. - builder.restoreInsertionPoint(oldIP); - genAtomicCaptureStatement(converter, fromAddress, alloca, - leftHandClauseList, rightHandClauseList, - elementType, loc); - emitAtomicReadImplicitCast(builder, loc, toAddress, toType, fromType, - alloca); - } else - genAtomicCaptureStatement(converter, fromAddress, toAddress, - leftHandClauseList, rightHandClauseList, - elementType, loc); -} - -/// Processes an atomic construct with update clause. 
-static void genAtomicUpdate(lower::AbstractConverter &converter, - const parser::OmpAtomicUpdate &atomicUpdate, - mlir::Location loc) { - const parser::OmpAtomicClauseList *rightHandClauseList = nullptr; - const parser::OmpAtomicClauseList *leftHandClauseList = nullptr; - // Get the address of atomic read operands. - rightHandClauseList = &std::get<2>(atomicUpdate.t); - leftHandClauseList = &std::get<0>(atomicUpdate.t); - - const auto &assignmentStmtExpr = std::get( - std::get>(atomicUpdate.t) - .statement.t); - const auto &assignmentStmtVariable = std::get( - std::get>(atomicUpdate.t) - .statement.t); + converter.overrideExprValues(&overrides); + mlir::Value updated = + fir::getBase(converter.genExprValue(assign.rhs, stmtCtx, &loc)); + mlir::Value converted = builder.createConvert(loc, atomType, updated); + builder.create(loc, converted); + converter.resetExprOverrides(); - lower::StatementContext stmtCtx; - mlir::Value lhsAddr = fir::getBase(converter.genExprAddr( - *semantics::GetExpr(assignmentStmtVariable), stmtCtx)); - mlir::Type varType = fir::unwrapRefType(lhsAddr.getType()); - genAtomicUpdateStatement(converter, lhsAddr, varType, assignmentStmtVariable, - assignmentStmtExpr, leftHandClauseList, - rightHandClauseList, loc); -} - -/// Processes an atomic construct with no clause - which implies update clause. 
-static void genOmpAtomic(lower::AbstractConverter &converter, - const parser::OmpAtomic &atomicConstruct, - mlir::Location loc) { - const parser::OmpAtomicClauseList &atomicClauseList = - std::get(atomicConstruct.t); - const auto &assignmentStmtExpr = std::get( - std::get>(atomicConstruct.t) - .statement.t); - const auto &assignmentStmtVariable = std::get( - std::get>(atomicConstruct.t) - .statement.t); - lower::StatementContext stmtCtx; - mlir::Value lhsAddr = fir::getBase(converter.genExprAddr( - *semantics::GetExpr(assignmentStmtVariable), stmtCtx)); - mlir::Type varType = fir::unwrapRefType(lhsAddr.getType()); - // If atomic-clause is not present on the construct, the behaviour is as if - // the update clause is specified (for both OpenMP and OpenACC). - genAtomicUpdateStatement(converter, lhsAddr, varType, assignmentStmtVariable, - assignmentStmtExpr, &atomicClauseList, nullptr, loc); -} - -/// Processes an atomic construct with capture clause. -static void genAtomicCapture(lower::AbstractConverter &converter, - const parser::OmpAtomicCapture &atomicCapture, - mlir::Location loc) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + builder.restoreInsertionPoint(postAt); // For naCtx cleanups + return updateOp; +} - const parser::AssignmentStmt &stmt1 = - std::get(atomicCapture.t).v.statement; - const evaluate::Assignment &assign1 = *stmt1.typedAssignment->v; - const auto &stmt1Var{std::get(stmt1.t)}; - const auto &stmt1Expr{std::get(stmt1.t)}; - const parser::AssignmentStmt &stmt2 = - std::get(atomicCapture.t).v.statement; - const evaluate::Assignment &assign2 = *stmt2.typedAssignment->v; - const auto &stmt2Var{std::get(stmt2.t)}; - const auto &stmt2Expr{std::get(stmt2.t)}; - - // Pre-evaluate expressions to be used in the various operations inside - // `atomic.capture` since it is not desirable to have anything other than - // a `atomic.read`, `atomic.write`, or `atomic.update` operation - // inside `atomic.capture` - lower::StatementContext 
stmtCtx; - // LHS evaluations are common to all combinations of `atomic.capture` - mlir::Value stmt1LHSArg = - fir::getBase(converter.genExprAddr(assign1.lhs, stmtCtx)); - mlir::Value stmt2LHSArg = - fir::getBase(converter.genExprAddr(assign2.lhs, stmtCtx)); - - // Type information used in generation of `atomic.update` operation - mlir::Type stmt1VarType = - fir::getBase(converter.genExprValue(assign1.lhs, stmtCtx)).getType(); - mlir::Type stmt2VarType = - fir::getBase(converter.genExprValue(assign2.lhs, stmtCtx)).getType(); - - mlir::Operation *atomicCaptureOp = nullptr; - mlir::IntegerAttr hint = nullptr; - mlir::omp::ClauseMemoryOrderKindAttr memoryOrder = nullptr; - const parser::OmpAtomicClauseList &rightHandClauseList = - std::get<2>(atomicCapture.t); - const parser::OmpAtomicClauseList &leftHandClauseList = - std::get<0>(atomicCapture.t); - genOmpAtomicHintAndMemoryOrderClauses(converter, leftHandClauseList, hint, - memoryOrder); - genOmpAtomicHintAndMemoryOrderClauses(converter, rightHandClauseList, hint, - memoryOrder); - atomicCaptureOp = - firOpBuilder.create(loc, hint, memoryOrder); - - firOpBuilder.createBlock(&(atomicCaptureOp->getRegion(0))); - mlir::Block &block = atomicCaptureOp->getRegion(0).back(); - firOpBuilder.setInsertionPointToStart(&block); - if (parser::CheckForSingleVariableOnRHS(stmt1)) { - if (semantics::CheckForSymbolMatch(semantics::GetExpr(stmt2Var), - semantics::GetExpr(stmt2Expr))) { - // Atomic capture construct is of the form [capture-stmt, update-stmt] - const semantics::SomeExpr &fromExpr = *semantics::GetExpr(stmt1Expr); - mlir::Type elementType = converter.genType(fromExpr); - if (stmt1VarType != stmt2VarType) { - mlir::Value alloca; - mlir::Type toType = fir::unwrapRefType(stmt1LHSArg.getType()); - mlir::Type fromType = fir::unwrapRefType(stmt2LHSArg.getType()); - { - mlir::OpBuilder::InsertionGuard guard(firOpBuilder); - firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock()); - alloca = 
firOpBuilder.create(loc, fromType); - } - genAtomicCaptureStatement(converter, stmt2LHSArg, alloca, - /*leftHandClauseList=*/nullptr, - /*rightHandClauseList=*/nullptr, elementType, - loc); - { - mlir::OpBuilder::InsertionGuard guard(firOpBuilder); - firOpBuilder.setInsertionPointAfter(atomicCaptureOp); - emitAtomicReadImplicitCast(firOpBuilder, loc, stmt1LHSArg, toType, - fromType, alloca); - } - } else { - genAtomicCaptureStatement(converter, stmt2LHSArg, stmt1LHSArg, - /*leftHandClauseList=*/nullptr, - /*rightHandClauseList=*/nullptr, elementType, - loc); - } - genAtomicUpdateStatement( - converter, stmt2LHSArg, stmt2VarType, stmt2Var, stmt2Expr, - /*leftHandClauseList=*/nullptr, - /*rightHandClauseList=*/nullptr, loc, atomicCaptureOp, &stmtCtx); - } else { - // Atomic capture construct is of the form [capture-stmt, write-stmt] - firOpBuilder.setInsertionPoint(atomicCaptureOp); - mlir::Value stmt2RHSArg = - fir::getBase(converter.genExprValue(assign2.rhs, stmtCtx)); - firOpBuilder.setInsertionPointToStart(&block); - const semantics::SomeExpr &fromExpr = *semantics::GetExpr(stmt1Expr); - mlir::Type elementType = converter.genType(fromExpr); - - if (stmt1VarType != stmt2VarType) { - mlir::Value alloca; - mlir::Type toType = fir::unwrapRefType(stmt1LHSArg.getType()); - mlir::Type fromType = fir::unwrapRefType(stmt2LHSArg.getType()); - { - mlir::OpBuilder::InsertionGuard guard(firOpBuilder); - firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock()); - alloca = firOpBuilder.create(loc, fromType); - } - genAtomicCaptureStatement(converter, stmt2LHSArg, alloca, - /*leftHandClauseList=*/nullptr, - /*rightHandClauseList=*/nullptr, elementType, - loc); - { - mlir::OpBuilder::InsertionGuard guard(firOpBuilder); - firOpBuilder.setInsertionPointAfter(atomicCaptureOp); - emitAtomicReadImplicitCast(firOpBuilder, loc, stmt1LHSArg, toType, - fromType, alloca); - } - } else { - genAtomicCaptureStatement(converter, stmt2LHSArg, stmt1LHSArg, - 
/*leftHandClauseList=*/nullptr, - /*rightHandClauseList=*/nullptr, elementType, - loc); - } - genAtomicWriteStatement(converter, stmt2LHSArg, stmt2RHSArg, - /*leftHandClauseList=*/nullptr, - /*rightHandClauseList=*/nullptr, loc); - } - } else { - // Atomic capture construct is of the form [update-stmt, capture-stmt] - const semantics::SomeExpr &fromExpr = *semantics::GetExpr(stmt2Expr); - mlir::Type elementType = converter.genType(fromExpr); - genAtomicUpdateStatement( - converter, stmt1LHSArg, stmt1VarType, stmt1Var, stmt1Expr, - /*leftHandClauseList=*/nullptr, - /*rightHandClauseList=*/nullptr, loc, atomicCaptureOp, &stmtCtx); - - if (stmt1VarType != stmt2VarType) { - mlir::Value alloca; - mlir::Type toType = fir::unwrapRefType(stmt2LHSArg.getType()); - mlir::Type fromType = fir::unwrapRefType(stmt1LHSArg.getType()); - - { - mlir::OpBuilder::InsertionGuard guard(firOpBuilder); - firOpBuilder.setInsertionPointToStart(firOpBuilder.getAllocaBlock()); - alloca = firOpBuilder.create(loc, fromType); - } +static mlir::Operation * +genAtomicOperation(lower::AbstractConverter &converter, mlir::Location loc, + lower::StatementContext &stmtCtx, int action, + mlir::Value atomAddr, const semantics::SomeExpr &atom, + const evaluate::Assignment &assign, mlir::IntegerAttr hint, + mlir::omp::ClauseMemoryOrderKindAttr memOrder, + fir::FirOpBuilder::InsertPoint preAt, + fir::FirOpBuilder::InsertPoint atomicAt, + fir::FirOpBuilder::InsertPoint postAt) { + if (isPointerAssignment(assign)) { + TODO(loc, "Code generation for pointer assignment is not implemented yet"); + } - genAtomicCaptureStatement(converter, stmt1LHSArg, alloca, - /*leftHandClauseList=*/nullptr, - /*rightHandClauseList=*/nullptr, elementType, - loc); - { - mlir::OpBuilder::InsertionGuard guard(firOpBuilder); - firOpBuilder.setInsertionPointAfter(atomicCaptureOp); - emitAtomicReadImplicitCast(firOpBuilder, loc, stmt2LHSArg, toType, - fromType, alloca); - } - } else { - genAtomicCaptureStatement(converter, 
stmt1LHSArg, stmt2LHSArg, - /*leftHandClauseList=*/nullptr, - /*rightHandClauseList=*/nullptr, elementType, - loc); - } + // This function and the functions called here do not preserve the + // builder's insertion point, or set it to anything specific. + switch (action) { + case parser::OpenMPAtomicConstruct::Analysis::Read: + return genAtomicRead(converter, loc, stmtCtx, atomAddr, atom, assign, hint, + memOrder, preAt, atomicAt, postAt); + case parser::OpenMPAtomicConstruct::Analysis::Write: + return genAtomicWrite(converter, loc, stmtCtx, atomAddr, atom, assign, hint, + memOrder, preAt, atomicAt, postAt); + case parser::OpenMPAtomicConstruct::Analysis::Update: + return genAtomicUpdate(converter, loc, stmtCtx, atomAddr, atom, assign, + hint, memOrder, preAt, atomicAt, postAt); + default: + return nullptr; } - firOpBuilder.setInsertionPointToEnd(&block); - firOpBuilder.create(loc); - // The clean-ups associated with the statements inside the capture - // construct must be generated after the AtomicCaptureOp. 
- firOpBuilder.setInsertionPointAfter(atomicCaptureOp); } //===----------------------------------------------------------------------===// @@ -4212,10 +3795,6 @@ genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, standaloneConstruct.u); } -//===----------------------------------------------------------------------===// -// OpenMPConstruct visitors -//===----------------------------------------------------------------------===// - static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, @@ -4223,38 +3802,164 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, TODO(converter.getCurrentLocation(), "OpenMPAllocatorsConstruct"); } +//===----------------------------------------------------------------------===// +// OpenMPConstruct visitors +//===----------------------------------------------------------------------===// + +[[maybe_unused]] static void +dumpAtomicAnalysis(const parser::OpenMPAtomicConstruct::Analysis &analysis) { + auto whatStr = [](int k) { + std::string txt = "?"; + switch (k & parser::OpenMPAtomicConstruct::Analysis::Action) { + case parser::OpenMPAtomicConstruct::Analysis::None: + txt = "None"; + break; + case parser::OpenMPAtomicConstruct::Analysis::Read: + txt = "Read"; + break; + case parser::OpenMPAtomicConstruct::Analysis::Write: + txt = "Write"; + break; + case parser::OpenMPAtomicConstruct::Analysis::Update: + txt = "Update"; + break; + } + switch (k & parser::OpenMPAtomicConstruct::Analysis::Condition) { + case parser::OpenMPAtomicConstruct::Analysis::IfTrue: + txt += " | IfTrue"; + break; + case parser::OpenMPAtomicConstruct::Analysis::IfFalse: + txt += " | IfFalse"; + break; + } + return txt; + }; + + auto exprStr = [&](const parser::TypedExpr &expr) { + if (auto *maybe = expr.get()) { + if (maybe->v) + return maybe->v->AsFortran(); + } + return ""s; + }; + auto assignStr = [&](const 
parser::AssignmentStmt::TypedAssignment &assign) { + if (auto *maybe = assign.get(); maybe && maybe->v) { + std::string str; + llvm::raw_string_ostream os(str); + maybe->v->AsFortran(os); + return str; + } + return ""s; + }; + + const SomeExpr &atom = *analysis.atom.get()->v; + + llvm::errs() << "Analysis {\n"; + llvm::errs() << " atom: " << atom.AsFortran() << "\n"; + llvm::errs() << " cond: " << exprStr(analysis.cond) << "\n"; + llvm::errs() << " op0 {\n"; + llvm::errs() << " what: " << whatStr(analysis.op0.what) << "\n"; + llvm::errs() << " assign: " << assignStr(analysis.op0.assign) << "\n"; + llvm::errs() << " }\n"; + llvm::errs() << " op1 {\n"; + llvm::errs() << " what: " << whatStr(analysis.op1.what) << "\n"; + llvm::errs() << " assign: " << assignStr(analysis.op1.assign) << "\n"; + llvm::errs() << " }\n"; + llvm::errs() << "}\n"; +} + static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, semantics::SemanticsContext &semaCtx, lower::pft::Evaluation &eval, - const parser::OpenMPAtomicConstruct &atomicConstruct) { - Fortran::common::visit( - common::visitors{ - [&](const parser::OmpAtomicRead &atomicRead) { - mlir::Location loc = converter.genLocation(atomicRead.source); - genAtomicRead(converter, atomicRead, loc); - }, - [&](const parser::OmpAtomicWrite &atomicWrite) { - mlir::Location loc = converter.genLocation(atomicWrite.source); - genAtomicWrite(converter, atomicWrite, loc); - }, - [&](const parser::OmpAtomic &atomicConstruct) { - mlir::Location loc = converter.genLocation(atomicConstruct.source); - genOmpAtomic(converter, atomicConstruct, loc); - }, - [&](const parser::OmpAtomicUpdate &atomicUpdate) { - mlir::Location loc = converter.genLocation(atomicUpdate.source); - genAtomicUpdate(converter, atomicUpdate, loc); - }, - [&](const parser::OmpAtomicCapture &atomicCapture) { - mlir::Location loc = converter.genLocation(atomicCapture.source); - genAtomicCapture(converter, atomicCapture, loc); - }, - [&](const 
parser::OmpAtomicCompare &atomicCompare) { - mlir::Location loc = converter.genLocation(atomicCompare.source); - TODO(loc, "OpenMP atomic compare"); - }, - }, - atomicConstruct.u); + const parser::OpenMPAtomicConstruct &construct) { + auto get = [](auto &&typedWrapper) -> decltype(&*typedWrapper.get()->v) { + if (auto *maybe = typedWrapper.get(); maybe && maybe->v) { + return &*maybe->v; + } else { + return nullptr; + } + }; + + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + auto &dirSpec = std::get(construct.t); + List clauses = makeClauses(dirSpec.Clauses(), semaCtx); + lower::StatementContext stmtCtx; + + const parser::OpenMPAtomicConstruct::Analysis &analysis = construct.analysis; + if (DumpAtomicAnalysis) + dumpAtomicAnalysis(analysis); + + const semantics::SomeExpr &atom = *get(analysis.atom); + mlir::Location loc = converter.genLocation(construct.source); + mlir::Value atomAddr = + fir::getBase(converter.genExprAddr(atom, stmtCtx, &loc)); + mlir::IntegerAttr hint = getAtomicHint(converter, clauses); + mlir::omp::ClauseMemoryOrderKindAttr memOrder = + getAtomicMemoryOrder(converter, semaCtx, clauses); + + if (auto *cond = get(analysis.cond)) { + (void)cond; + TODO(loc, "OpenMP ATOMIC COMPARE"); + } else { + int action0 = analysis.op0.what & analysis.Action; + int action1 = analysis.op1.what & analysis.Action; + mlir::Operation *captureOp = nullptr; + fir::FirOpBuilder::InsertPoint preAt = builder.saveInsertionPoint(); + fir::FirOpBuilder::InsertPoint atomicAt, postAt; + + if (construct.IsCapture()) { + // Capturing operation. + assert(action0 != analysis.None && action1 != analysis.None && + "Expexcing two actions"); + captureOp = + builder.create(loc, hint, memOrder); + // Set the non-atomic insertion point to before the atomic.capture. 
+ preAt = getInsertionPointBefore(captureOp); + + mlir::Block *block = builder.createBlock(&captureOp->getRegion(0)); + builder.setInsertionPointToEnd(block); + // Set the atomic insertion point to before the terminator inside + // atomic.capture. + mlir::Operation *term = builder.create(loc); + atomicAt = getInsertionPointBefore(term); + postAt = getInsertionPointAfter(captureOp); + hint = nullptr; + memOrder = nullptr; + } else { + // Non-capturing operation. + assert(action0 != analysis.None && action1 == analysis.None && + "Expexcing single action"); + assert(!(analysis.op0.what & analysis.Condition)); + postAt = atomicAt = preAt; + } + + // The builder's insertion point needs to be specifically set before + // each call to `genAtomicOperation`. + mlir::Operation *firstOp = genAtomicOperation( + converter, loc, stmtCtx, analysis.op0.what, atomAddr, atom, + *get(analysis.op0.assign), hint, memOrder, preAt, atomicAt, postAt); + assert(firstOp && "Should have created an atomic operation"); + atomicAt = getInsertionPointAfter(firstOp); + + mlir::Operation *secondOp = nullptr; + if (analysis.op1.what != analysis.None) { + secondOp = genAtomicOperation(converter, loc, stmtCtx, analysis.op1.what, + atomAddr, atom, *get(analysis.op1.assign), + hint, memOrder, preAt, atomicAt, postAt); + } + + if (construct.IsCapture()) { + // If this is a capture operation, the first/second ops will be inside + // of it. Set the insertion point to past the capture op itself. 
+ builder.restoreInsertionPoint(postAt); + } else { + if (secondOp) { + builder.setInsertionPointAfter(secondOp); + } else { + builder.setInsertionPointAfter(firstOp); + } + } + } } static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, diff --git a/flang/lib/Parser/openmp-parsers.cpp b/flang/lib/Parser/openmp-parsers.cpp index 08326fad8c143..9b112a2133918 100644 --- a/flang/lib/Parser/openmp-parsers.cpp +++ b/flang/lib/Parser/openmp-parsers.cpp @@ -24,6 +24,12 @@ // OpenMP Directives and Clauses namespace Fortran::parser { +// Helper function to print the buffer contents starting at the current point. +[[maybe_unused]] static std::string ahead(const ParseState &state) { + return std::string( + state.GetLocation(), std::min(64, state.BytesRemaining())); +} + constexpr auto startOmpLine = skipStuffBeforeStatement >> "!$OMP "_sptok; constexpr auto endOmpLine = space >> endOfLine; @@ -941,8 +947,10 @@ TYPE_PARSER( // parenthesized(Parser{}))) || "BIND" >> construct(construct( parenthesized(Parser{}))) || + "CAPTURE" >> construct(construct()) || "COLLAPSE" >> construct(construct( parenthesized(scalarIntConstantExpr))) || + "COMPARE" >> construct(construct()) || "CONTAINS" >> construct(construct( parenthesized(Parser{}))) || "COPYIN" >> construct(construct( @@ -1062,6 +1070,7 @@ TYPE_PARSER( // "TASK_REDUCTION" >> construct(construct( parenthesized(Parser{}))) || + "READ" >> construct(construct()) || "RELAXED" >> construct(construct()) || "RELEASE" >> construct(construct()) || "REVERSE_OFFLOAD" >> @@ -1105,6 +1114,7 @@ TYPE_PARSER( // maybe(Parser{}))) || "WHEN" >> construct(construct( parenthesized(Parser{}))) || + "WRITE" >> construct(construct()) || // Cancellable constructs construct(construct( Parser{}))) @@ -1223,6 +1233,155 @@ TYPE_PARSER(sourced(construct(first( TYPE_PARSER(sourced(construct( sourced(Parser{}), Parser{}))) +struct OmpEndDirectiveParser { + using resultType = OmpDirectiveSpecification; + + constexpr 
OmpEndDirectiveParser(llvm::omp::Directive dir) : dir_(dir) {} + + std::optional Parse(ParseState &state) const { + if ((startOmpLine >> "END"_sptok).Parse(state)) { + auto &&dirSpec{Parser{}.Parse(state)}; + if (dirSpec && dirSpec->DirId() == dir_) { + return std::move(dirSpec); + } + } + return std::nullopt; + } + +private: + llvm::omp::Directive dir_; +}; + +// Parser for an arbitrary OpenMP ATOMIC construct. +// +// Depending on circumstances, an ATOMIC construct applies to one or more +// following statements. In certain cases when a single statement is +// expected, the end-directive is optional. The specifics depend on both +// the clauses used, and the form of the executable statement. To emit +// more meaningful messages in case of errors, the exact analysis of the +// structure of the construct will be delayed until semantic checks. +// +// The parser will first try the case when the end-directive is present, +// and will parse at most "BodyLimit" (and potentially zero) constructs +// while looking for the end-directive before it gives up. +// Then it will assume that no end-directive is present, and will try to +// parse a single executable construct as the body of the construct. +// +// The limit on the number of constructs is there to reduce the amount of +// unnecessary parsing when the end-directive is absent. It's higher than +// the maximum number of statements in any valid construct to accept cases +// when extra statements are present by mistake. +// A problem can occur when atomic constructs without end-directive follow +// each other closely, e.g. +// !$omp atomic write +// x = v +// !$omp atomic update +// x = x + 1 +// ... +// The speculative parsing will become "recursive", and has the potential +// to take a (practically) infinite amount of time given a sufficiently +// large number of such constructs in a row. 
Since atomic constructs cannot +// contain other OpenMP constructs, guarding against recursive calls to the +// atomic construct parser solves the problem. +struct OmpAtomicConstructParser { + using resultType = OpenMPAtomicConstruct; + + static constexpr size_t BodyLimit{5}; + + std::optional Parse(ParseState &state) const { + if (recursing_) { + return std::nullopt; + } + recursing_ = true; + + auto dirSpec{Parser{}.Parse(state)}; + if (!dirSpec || dirSpec->DirId() != llvm::omp::Directive::OMPD_atomic) { + recursing_ = false; + return std::nullopt; + } + + auto exec{Parser{}}; + auto end{OmpEndDirectiveParser{llvm::omp::Directive::OMPD_atomic}}; + TailType tail; + + if (ParseOne(exec, end, tail, state)) { + if (!tail.first.empty()) { + if (auto &&rest{attempt(LimitedTailParser(BodyLimit)).Parse(state)}) { + for (auto &&s : rest->first) { + tail.first.emplace_back(std::move(s)); + } + assert(!tail.second); + tail.second = std::move(rest->second); + } + } + recursing_ = false; + return OpenMPAtomicConstruct{ + std::move(*dirSpec), std::move(tail.first), std::move(tail.second)}; + } + + recursing_ = false; + return std::nullopt; + } + +private: + // Begin-directive + TailType = entire construct. + using TailType = std::pair>; + + // Parse either an ExecutionPartConstruct, or atomic end-directive. When + // successful, record the result in the "tail" provided, otherwise fail. 
+ static std::optional ParseOne( // + Parser &exec, OmpEndDirectiveParser &end, + TailType &tail, ParseState &state) { + auto isRecovery{[](const ExecutionPartConstruct &e) { + return std::holds_alternative(e.u); + }}; + if (auto &&stmt{attempt(exec).Parse(state)}; stmt && !isRecovery(*stmt)) { + tail.first.emplace_back(std::move(*stmt)); + } else if (auto &&dir{attempt(end).Parse(state)}) { + tail.second = std::move(*dir); + } else { + return std::nullopt; + } + return Success{}; + } + + struct LimitedTailParser { + using resultType = TailType; + + constexpr LimitedTailParser(size_t count) : count_(count) {} + + std::optional Parse(ParseState &state) const { + auto exec{Parser{}}; + auto end{OmpEndDirectiveParser{llvm::omp::Directive::OMPD_atomic}}; + TailType tail; + + for (size_t i{0}; i != count_; ++i) { + if (ParseOne(exec, end, tail, state)) { + if (tail.second) { + // Return when the end-directive was parsed. + return std::move(tail); + } + } else { + break; + } + } + return std::nullopt; + } + + private: + const size_t count_; + }; + + // The recursion guard should become thread_local if parsing is ever + // parallelized. + static bool recursing_; +}; + +bool OmpAtomicConstructParser::recursing_{false}; + +TYPE_PARSER(sourced( // + construct(OmpAtomicConstructParser{}))) + // 2.17.7 Atomic construct/2.17.8 Flush construct [OpenMP 5.0] // memory-order-clause -> // acq_rel @@ -1237,19 +1396,6 @@ TYPE_PARSER(sourced(construct( "RELEASE" >> construct(construct()) || "SEQ_CST" >> construct(construct()))))) -// 2.17.7 Atomic construct -// atomic-clause -> memory-order-clause | HINT(hint-expression) -TYPE_PARSER(sourced(construct( - construct(Parser{}) || - construct( - "FAIL" >> parenthesized(Parser{})) || - construct( - "HINT" >> parenthesized(Parser{}))))) - -// atomic-clause-list -> [atomic-clause, [atomic-clause], ...] 
-TYPE_PARSER(sourced(construct( - many(maybe(","_tok) >> sourced(Parser{}))))) - static bool IsSimpleStandalone(const OmpDirectiveName &name) { switch (name.v) { case llvm::omp::Directive::OMPD_barrier: @@ -1421,67 +1567,6 @@ TYPE_PARSER(sourced( TYPE_PARSER(construct(Parser{}) || construct(Parser{})) -// 2.17.7 atomic -> ATOMIC [clause [,]] atomic-clause [[,] clause] | -// ATOMIC [clause] -// clause -> memory-order-clause | HINT(hint-expression) -// memory-order-clause -> SEQ_CST | ACQ_REL | RELEASE | ACQUIRE | RELAXED -// atomic-clause -> READ | WRITE | UPDATE | CAPTURE - -// OMP END ATOMIC -TYPE_PARSER(construct(startOmpLine >> "END ATOMIC"_tok)) - -// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] READ [MEMORY-ORDER-CLAUSE-LIST] -TYPE_PARSER("ATOMIC" >> - sourced(construct( - Parser{} / maybe(","_tok), verbatim("READ"_tok), - Parser{} / endOmpLine, statement(assignmentStmt), - maybe(Parser{} / endOmpLine)))) - -// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] CAPTURE [MEMORY-ORDER-CLAUSE-LIST] -TYPE_PARSER("ATOMIC" >> - sourced(construct( - Parser{} / maybe(","_tok), verbatim("CAPTURE"_tok), - Parser{} / endOmpLine, statement(assignmentStmt), - statement(assignmentStmt), Parser{} / endOmpLine))) - -TYPE_PARSER(construct(indirect(Parser{})) || - construct(indirect(Parser{}))) - -// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] COMPARE [MEMORY-ORDER-CLAUSE-LIST] -TYPE_PARSER("ATOMIC" >> - sourced(construct( - Parser{} / maybe(","_tok), verbatim("COMPARE"_tok), - Parser{} / endOmpLine, - Parser{}, - maybe(Parser{} / endOmpLine)))) - -// OMP ATOMIC [MEMORY-ORDER-CLAUSE-LIST] UPDATE [MEMORY-ORDER-CLAUSE-LIST] -TYPE_PARSER("ATOMIC" >> - sourced(construct( - Parser{} / maybe(","_tok), verbatim("UPDATE"_tok), - Parser{} / endOmpLine, statement(assignmentStmt), - maybe(Parser{} / endOmpLine)))) - -// OMP ATOMIC [atomic-clause-list] -TYPE_PARSER(sourced(construct(verbatim("ATOMIC"_tok), - Parser{} / endOmpLine, statement(assignmentStmt), - maybe(Parser{} / endOmpLine)))) - -// OMP ATOMIC 
[MEMORY-ORDER-CLAUSE-LIST] WRITE [MEMORY-ORDER-CLAUSE-LIST] -TYPE_PARSER("ATOMIC" >> - sourced(construct( - Parser{} / maybe(","_tok), verbatim("WRITE"_tok), - Parser{} / endOmpLine, statement(assignmentStmt), - maybe(Parser{} / endOmpLine)))) - -// Atomic Construct -TYPE_PARSER(construct(Parser{}) || - construct(Parser{}) || - construct(Parser{}) || - construct(Parser{}) || - construct(Parser{}) || - construct(Parser{})) - // 2.13.2 OMP CRITICAL TYPE_PARSER(startOmpLine >> sourced(construct( diff --git a/flang/lib/Parser/parse-tree.cpp b/flang/lib/Parser/parse-tree.cpp index 3dd87ad9a3650..824612e49293f 100644 --- a/flang/lib/Parser/parse-tree.cpp +++ b/flang/lib/Parser/parse-tree.cpp @@ -321,6 +321,34 @@ std::string OmpTraitSetSelectorName::ToString() const { return std::string(EnumToString(v)); } +llvm::omp::Clause OpenMPAtomicConstruct::GetKind() const { + auto &dirSpec{std::get(t)}; + for (auto &clause : dirSpec.Clauses().v) { + switch (clause.Id()) { + case llvm::omp::Clause::OMPC_read: + case llvm::omp::Clause::OMPC_write: + case llvm::omp::Clause::OMPC_update: + return clause.Id(); + default: + break; + } + } + return llvm::omp::Clause::OMPC_update; +} + +bool OpenMPAtomicConstruct::IsCapture() const { + auto &dirSpec{std::get(t)}; + return llvm::any_of(dirSpec.Clauses().v, [](auto &clause) { + return clause.Id() == llvm::omp::Clause::OMPC_capture; + }); +} + +bool OpenMPAtomicConstruct::IsCompare() const { + auto &dirSpec{std::get(t)}; + return llvm::any_of(dirSpec.Clauses().v, [](auto &clause) { + return clause.Id() == llvm::omp::Clause::OMPC_compare; + }); +} } // namespace Fortran::parser template static llvm::omp::Clause getClauseIdForClass(C &&) { diff --git a/flang/lib/Parser/unparse.cpp b/flang/lib/Parser/unparse.cpp index e0abe95d07c86..ed0f227fd5b98 100644 --- a/flang/lib/Parser/unparse.cpp +++ b/flang/lib/Parser/unparse.cpp @@ -2571,83 +2571,22 @@ class UnparseVisitor { Word(ToUpperCaseLetters(common::EnumToString(x))); } - void Unparse(const 
OmpAtomicClauseList &x) { Walk(" ", x.v, " "); } - - void Unparse(const OmpAtomic &x) { - BeginOpenMP(); - Word("!$OMP ATOMIC"); - Walk(std::get(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get>(x.t)); - BeginOpenMP(); - Walk(std::get>(x.t), "!$OMP END ATOMIC\n"); - EndOpenMP(); - } - void Unparse(const OmpAtomicCapture &x) { - BeginOpenMP(); - Word("!$OMP ATOMIC"); - Walk(std::get<0>(x.t)); - Word(" CAPTURE"); - Walk(std::get<2>(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get(x.t)); - Put("\n"); - Walk(std::get(x.t)); - BeginOpenMP(); - Word("!$OMP END ATOMIC\n"); - EndOpenMP(); - } - void Unparse(const OmpAtomicCompare &x) { - BeginOpenMP(); - Word("!$OMP ATOMIC"); - Walk(std::get<0>(x.t)); - Word(" COMPARE"); - Walk(std::get<2>(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get(x.t)); - } - void Unparse(const OmpAtomicRead &x) { - BeginOpenMP(); - Word("!$OMP ATOMIC"); - Walk(std::get<0>(x.t)); - Word(" READ"); - Walk(std::get<2>(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get>(x.t)); - BeginOpenMP(); - Walk(std::get>(x.t), "!$OMP END ATOMIC\n"); - EndOpenMP(); - } - void Unparse(const OmpAtomicUpdate &x) { + void Unparse(const OpenMPAtomicConstruct &x) { BeginOpenMP(); - Word("!$OMP ATOMIC"); - Walk(std::get<0>(x.t)); - Word(" UPDATE"); - Walk(std::get<2>(x.t)); - Put("\n"); - EndOpenMP(); - Walk(std::get>(x.t)); - BeginOpenMP(); - Walk(std::get>(x.t), "!$OMP END ATOMIC\n"); - EndOpenMP(); - } - void Unparse(const OmpAtomicWrite &x) { - BeginOpenMP(); - Word("!$OMP ATOMIC"); - Walk(std::get<0>(x.t)); - Word(" WRITE"); - Walk(std::get<2>(x.t)); + Word("!$OMP "); + Walk(std::get(x.t)); Put("\n"); EndOpenMP(); - Walk(std::get>(x.t)); - BeginOpenMP(); - Walk(std::get>(x.t), "!$OMP END ATOMIC\n"); - EndOpenMP(); + Walk(std::get(x.t), ""); + if (auto &end{std::get>(x.t)}) { + BeginOpenMP(); + Word("!$OMP END "); + Walk(*end); + Put("\n"); + EndOpenMP(); + } } + void Unparse(const OpenMPExecutableAllocate &x) { const auto &fields = std::get>>( @@ -2920,23 
+2859,8 @@ class UnparseVisitor { Put("\n"); EndOpenMP(); } + void Unparse(const OmpFailClause &x) { Walk(x.v); } void Unparse(const OmpMemoryOrderClause &x) { Walk(x.v); } - void Unparse(const OmpAtomicClause &x) { - common::visit(common::visitors{ - [&](const OmpMemoryOrderClause &y) { Walk(y); }, - [&](const OmpFailClause &y) { - Word("FAIL("); - Walk(y.v); - Put(")"); - }, - [&](const OmpHintClause &y) { - Word("HINT("); - Walk(y.v); - Put(")"); - }, - }, - x.u); - } void Unparse(const OmpMetadirectiveDirective &x) { BeginOpenMP(); Word("!$OMP METADIRECTIVE "); diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index 31fcbb9683202..4dccb0e88e324 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -17,10 +17,16 @@ #include "flang/Semantics/openmp-modifiers.h" #include "flang/Semantics/tools.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringSwitch.h" #include namespace Fortran::semantics { +template +static bool operator!=(const evaluate::Expr &e, const evaluate::Expr &f) { + return !(e == f); +} + // Use when clause falls under 'struct OmpClause' in 'parse-tree.h'. 
#define CHECK_SIMPLE_CLAUSE(X, Y) \ void OmpStructureChecker::Enter(const parser::OmpClause::X &) { \ @@ -79,6 +85,32 @@ static const parser::ArrayElement *GetArrayElementFromObj( return nullptr; } +static bool IsVarOrFunctionRef(const MaybeExpr &expr) { + if (expr) { + return evaluate::UnwrapProcedureRef(*expr) != nullptr || + evaluate::IsVariable(*expr); + } else { + return false; + } +} + +static std::optional GetEvaluateExpr(const parser::Expr &parserExpr) { + const parser::TypedExpr &typedExpr{parserExpr.typedExpr}; + // ForwardOwningPointer typedExpr + // `- GenericExprWrapper ^.get() + // `- std::optional ^->v + return typedExpr.get()->v; +} + +static std::optional GetDynamicType( + const parser::Expr &parserExpr) { + if (auto maybeExpr{GetEvaluateExpr(parserExpr)}) { + return maybeExpr->GetType(); + } else { + return std::nullopt; + } +} + // 'OmpWorkshareBlockChecker' is used to check the validity of the assignment // statements and the expressions enclosed in an OpenMP Workshare construct class OmpWorkshareBlockChecker { @@ -595,51 +627,26 @@ void OmpStructureChecker::CheckPredefinedAllocatorRestriction( } } -template -void OmpStructureChecker::CheckHintClause( - D *leftOmpClauseList, D *rightOmpClauseList, std::string_view dirName) { - bool foundHint{false}; +void OmpStructureChecker::Enter(const parser::OmpClause::Hint &x) { + CheckAllowedClause(llvm::omp::Clause::OMPC_hint); + auto &dirCtx{GetContext()}; - auto checkForValidHintClause = [&](const D *clauseList) { - for (const auto &clause : clauseList->v) { - const parser::OmpHintClause *ompHintClause = nullptr; - if constexpr (std::is_same_v) { - ompHintClause = std::get_if(&clause.u); - } else if constexpr (std::is_same_v) { - if (auto *hint{std::get_if(&clause.u)}) { - ompHintClause = &hint->v; - } - } - if (!ompHintClause) - continue; - if (foundHint) { - context_.Say(clause.source, - "At most one HINT clause can appear on the %s directive"_err_en_US, - parser::ToUpperCaseLetters(dirName)); - } - 
foundHint = true; - std::optional hintValue = GetIntValue(ompHintClause->v); - if (hintValue && *hintValue >= 0) { - /*`omp_sync_hint_nonspeculative` and `omp_lock_hint_speculative`*/ - if ((*hintValue & 0xC) == 0xC - /*`omp_sync_hint_uncontended` and omp_sync_hint_contended*/ - || (*hintValue & 0x3) == 0x3) - context_.Say(clause.source, - "Hint clause value " - "is not a valid OpenMP synchronization value"_err_en_US); - } else { - context_.Say(clause.source, - "Hint clause must have non-negative constant " - "integer expression"_err_en_US); + if (std::optional maybeVal{GetIntValue(x.v.v)}) { + int64_t val{*maybeVal}; + if (val >= 0) { + // Check contradictory values. + if ((val & 0xC) == 0xC || // omp_sync_hint_speculative and nonspeculative + (val & 0x3) == 0x3) { // omp_sync_hint_contended and uncontended + context_.Say(dirCtx.clauseSource, + "The synchronization hint is not valid"_err_en_US); } + } else { + context_.Say(dirCtx.clauseSource, + "Synchronization hint must be non-negative"_err_en_US); } - }; - - if (leftOmpClauseList) { - checkForValidHintClause(leftOmpClauseList); - } - if (rightOmpClauseList) { - checkForValidHintClause(rightOmpClauseList); + } else { + context_.Say(dirCtx.clauseSource, + "Synchronization hint must be a constant integer value"_err_en_US); } } @@ -2396,8 +2403,9 @@ void OmpStructureChecker::Leave(const parser::OpenMPCancelConstruct &) { void OmpStructureChecker::Enter(const parser::OpenMPCriticalConstruct &x) { const auto &dir{std::get(x.t)}; + const auto &dirSource{std::get(dir.t).source}; const auto &endDir{std::get(x.t)}; - PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_critical); + PushContextAndClauseSets(dirSource, llvm::omp::Directive::OMPD_critical); const auto &block{std::get(x.t)}; CheckNoBranching(block, llvm::omp::Directive::OMPD_critical, dir.source); const auto &dirName{std::get>(dir.t)}; @@ -2430,7 +2438,6 @@ void OmpStructureChecker::Enter(const parser::OpenMPCriticalConstruct &x) { "Hint clause 
other than omp_sync_hint_none cannot be specified for " "an unnamed CRITICAL directive"_err_en_US}); } - CheckHintClause(&ompClause, nullptr, "CRITICAL"); } void OmpStructureChecker::Leave(const parser::OpenMPCriticalConstruct &) { @@ -2667,422 +2674,1418 @@ void OmpStructureChecker::Leave(const parser::OmpEndBlockDirective &x) { } } -inline void OmpStructureChecker::ErrIfAllocatableVariable( - const parser::Variable &var) { - // Err out if the given symbol has - // ALLOCATABLE attribute - if (const auto *e{GetExpr(context_, var)}) - for (const Symbol &symbol : evaluate::CollectSymbols(*e)) - if (IsAllocatable(symbol)) { - const auto &designator = - std::get>(var.u); - const auto *dataRef = - std::get_if(&designator.value().u); - const parser::Name *name = - dataRef ? std::get_if(&dataRef->u) : nullptr; - if (name) - context_.Say(name->source, - "%s must not have ALLOCATABLE " - "attribute"_err_en_US, - name->ToString()); +/// parser::Block is a list of executable constructs, parser::BlockConstruct +/// is Fortran's BLOCK/ENDBLOCK construct. +/// Strip the outermost BlockConstructs, return the reference to the Block +/// in the executable part of the innermost of the stripped constructs. +/// Specifically, if the given `block` has a single entry (it's a list), and +/// the entry is a BlockConstruct, get the Block contained within. Repeat +/// this step as many times as possible. 
+static const parser::Block &GetInnermostExecPart(const parser::Block &block) { + const parser::Block *iter{&block}; + while (iter->size() == 1) { + const parser::ExecutionPartConstruct &ep{iter->front()}; + if (auto *exec{std::get_if(&ep.u)}) { + using BlockConstruct = common::Indirection; + if (auto *bc{std::get_if(&exec->u)}) { + iter = &std::get(bc->value().t); + continue; } + } + break; + } + return *iter; } -inline void OmpStructureChecker::ErrIfLHSAndRHSSymbolsMatch( - const parser::Variable &var, const parser::Expr &expr) { - // Err out if the symbol on the LHS is also used on the RHS of the assignment - // statement - const auto *e{GetExpr(context_, expr)}; - const auto *v{GetExpr(context_, var)}; - if (e && v) { - auto vSyms{evaluate::GetSymbolVector(*v)}; - const Symbol &varSymbol = vSyms.front(); - for (const Symbol &symbol : evaluate::GetSymbolVector(*e)) { - if (varSymbol == symbol) { - const common::Indirection *designator = - std::get_if>(&expr.u); - if (designator) { - auto *z{var.typedExpr.get()}; - auto *c{expr.typedExpr.get()}; - if (z->v == c->v) { - context_.Say(expr.source, - "RHS expression on atomic assignment statement cannot access '%s'"_err_en_US, - var.GetSource()); - } +// There is no consistent way to get the source of a given ActionStmt, so +// extract the source information from Statement when we can, +// and keep it around for error reporting in further analyses. 
+struct SourcedActionStmt { + const parser::ActionStmt *stmt{nullptr}; + parser::CharBlock source; + + operator bool() const { return stmt != nullptr; } +}; + +struct AnalyzedCondStmt { + SomeExpr cond{evaluate::NullPointer{}}; // Default ctor is deleted + parser::CharBlock source; + SourcedActionStmt ift, iff; +}; + +static SourcedActionStmt GetActionStmt( + const parser::ExecutionPartConstruct *x) { + if (x == nullptr) { + return SourcedActionStmt{}; + } + if (auto *exec{std::get_if(&x->u)}) { + using ActionStmt = parser::Statement; + if (auto *stmt{std::get_if(&exec->u)}) { + return SourcedActionStmt{&stmt->statement, stmt->source}; + } + } + return SourcedActionStmt{}; +} + +static SourcedActionStmt GetActionStmt(const parser::Block &block) { + if (block.size() == 1) { + return GetActionStmt(&block.front()); + } + return SourcedActionStmt{}; +} + +// Compute the `evaluate::Assignment` from parser::ActionStmt. The assumption +// is that the ActionStmt will be either an assignment or a pointer-assignment, +// otherwise return std::nullopt. +// Note: This function can return std::nullopt on [Pointer]AssignmentStmt where +// the "typedAssignment" is unset. This can happen if there are semantic errors +// in the purported assignment. 
+static std::optional GetEvaluateAssignment( + const parser::ActionStmt *x) { + if (x == nullptr) { + return std::nullopt; + } + + using AssignmentStmt = common::Indirection; + using PointerAssignmentStmt = + common::Indirection; + using TypedAssignment = parser::AssignmentStmt::TypedAssignment; + + return common::visit( + [](auto &&s) -> std::optional { + using BareS = llvm::remove_cvref_t; + if constexpr (std::is_same_v || + std::is_same_v) { + const TypedAssignment &typed{s.value().typedAssignment}; + // ForwardOwningPointer typedAssignment + // `- GenericAssignmentWrapper ^.get() + // `- std::optional ^->v + return typed.get()->v; } else { - context_.Say(expr.source, - "RHS expression on atomic assignment statement cannot access '%s'"_err_en_US, - var.GetSource()); + return std::nullopt; + } + }, + x->u); +} + +// Check if the ActionStmt is actually a [Pointer]AssignmentStmt. This is +// to separate cases where the source has something that looks like an +// assignment, but is semantically wrong (diagnosed by general semantic +// checks), and where the source has some other statement (which we want +// to report as "should be an assignment"). +static bool IsAssignment(const parser::ActionStmt *x) { + if (x == nullptr) { + return false; + } + + using AssignmentStmt = common::Indirection; + using PointerAssignmentStmt = + common::Indirection; + + return common::visit( + [](auto &&s) -> bool { + using BareS = llvm::remove_cvref_t; + return std::is_same_v || + std::is_same_v; + }, + x->u); +} + +static std::optional AnalyzeConditionalStmt( + const parser::ExecutionPartConstruct *x) { + if (x == nullptr) { + return std::nullopt; + } + + // Extract the evaluate::Expr from ScalarLogicalExpr. 
+ auto getFromLogical{[](const parser::ScalarLogicalExpr &logical) { + // ScalarLogicalExpr is Scalar>> + const parser::Expr &expr{logical.thing.thing.value()}; + return GetEvaluateExpr(expr); + }}; + + // Recognize either + // ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> IfStmt, or + // ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct. + + if (auto &&action{GetActionStmt(x)}) { + if (auto *ifs{std::get_if>( + &action.stmt->u)}) { + const parser::IfStmt &s{ifs->value()}; + auto &&maybeCond{ + getFromLogical(std::get(s.t))}; + auto &thenStmt{ + std::get>(s.t)}; + if (maybeCond) { + return AnalyzedCondStmt{std::move(*maybeCond), action.source, + SourcedActionStmt{&thenStmt.statement, thenStmt.source}, + SourcedActionStmt{}}; + } + } + return std::nullopt; + } + + if (auto *exec{std::get_if(&x->u)}) { + if (auto *ifc{ + std::get_if>(&exec->u)}) { + using ElseBlock = parser::IfConstruct::ElseBlock; + using ElseIfBlock = parser::IfConstruct::ElseIfBlock; + const parser::IfConstruct &s{ifc->value()}; + + if (!std::get>(s.t).empty()) { + // Not expecting any else-if statements. 
+ return std::nullopt; + } + auto &stmt{std::get>(s.t)}; + auto &&maybeCond{getFromLogical( + std::get(stmt.statement.t))}; + if (!maybeCond) { + return std::nullopt; + } + + if (auto &maybeElse{std::get>(s.t)}) { + AnalyzedCondStmt result{std::move(*maybeCond), stmt.source, + GetActionStmt(std::get(s.t)), + GetActionStmt(std::get(maybeElse->t))}; + if (result.ift.stmt && result.iff.stmt) { + return result; + } + } else { + AnalyzedCondStmt result{std::move(*maybeCond), stmt.source, + GetActionStmt(std::get(s.t)), SourcedActionStmt{}}; + if (result.ift.stmt) { + return result; } } } + return std::nullopt; } + + return std::nullopt; } -inline void OmpStructureChecker::ErrIfNonScalarAssignmentStmt( - const parser::Variable &var, const parser::Expr &expr) { - // Err out if either the variable on the LHS or the expression on the RHS of - // the assignment statement are non-scalar (i.e. have rank > 0 or is of - // CHARACTER type) - const auto *e{GetExpr(context_, expr)}; - const auto *v{GetExpr(context_, var)}; - if (e && v) { - if (e->Rank() != 0 || - (e->GetType().has_value() && - e->GetType().value().category() == common::TypeCategory::Character)) - context_.Say(expr.source, - "Expected scalar expression " - "on the RHS of atomic assignment " - "statement"_err_en_US); - if (v->Rank() != 0 || - (v->GetType().has_value() && - v->GetType()->category() == common::TypeCategory::Character)) - context_.Say(var.GetSource(), - "Expected scalar variable " - "on the LHS of atomic assignment " - "statement"_err_en_US); - } -} - -template -bool OmpStructureChecker::IsOperatorValid(const T &node, const D &variable) { - using AllowedBinaryOperators = - std::variant; - using BinaryOperators = std::variant; - - if constexpr (common::HasMember) { - const auto &variableName{variable.GetSource().ToString()}; - const auto &exprLeft{std::get<0>(node.t)}; - const auto &exprRight{std::get<1>(node.t)}; - if ((exprLeft.value().source.ToString() != variableName) && - 
(exprRight.value().source.ToString() != variableName)) { - context_.Say(variable.GetSource(), - "Atomic update statement should be of form " - "`%s = %s operator expr` OR `%s = expr operator %s`"_err_en_US, - variableName, variableName, variableName, variableName); - } - return common::HasMember; +static std::pair SplitAssignmentSource( + parser::CharBlock source) { + // Find => in the range, if not found, find = that is not a part of + // <=, >=, ==, or /=. + auto trim{[](std::string_view v) { + const char *begin{v.data()}; + const char *end{begin + v.size()}; + while (*begin == ' ' && begin != end) { + ++begin; + } + while (begin != end && end[-1] == ' ') { + --end; + } + assert(begin != end && "Source should not be empty"); + return parser::CharBlock(begin, end - begin); + }}; + + std::string_view sv(source.begin(), source.size()); + + if (auto where{sv.find("=>")}; where != sv.npos) { + std::string_view lhs(sv.data(), where); + std::string_view rhs(sv.data() + where + 2, sv.size() - where - 2); + return std::make_pair(trim(lhs), trim(rhs)); } - return false; + + // Go backwards, since all the exclusions above end with a '='. + for (size_t next{source.size()}; next > 1; --next) { + if (sv[next - 1] == '=' && !llvm::is_contained("<>=/", sv[next - 2])) { + std::string_view lhs(sv.data(), next - 1); + std::string_view rhs(sv.data() + next, sv.size() - next); + return std::make_pair(trim(lhs), trim(rhs)); + } + } + llvm_unreachable("Could not find assignment operator"); } -void OmpStructureChecker::CheckAtomicCaptureStmt( - const parser::AssignmentStmt &assignmentStmt) { - const auto &var{std::get(assignmentStmt.t)}; - const auto &expr{std::get(assignmentStmt.t)}; - common::visit( - common::visitors{ - [&](const common::Indirection &designator) { - const auto *dataRef = - std::get_if(&designator.value().u); - const auto *name = - dataRef ? 
std::get_if(&dataRef->u) : nullptr; - if (name && IsAllocatable(*name->symbol)) - context_.Say(name->source, - "%s must not have ALLOCATABLE " - "attribute"_err_en_US, - name->ToString()); - }, - [&](const auto &) { - // Anything other than a `parser::Designator` is not allowed - context_.Say(expr.source, - "Expected scalar variable " - "of intrinsic type on RHS of atomic " - "assignment statement"_err_en_US); - }}, - expr.u); - ErrIfLHSAndRHSSymbolsMatch(var, expr); - ErrIfNonScalarAssignmentStmt(var, expr); -} - -void OmpStructureChecker::CheckAtomicWriteStmt( - const parser::AssignmentStmt &assignmentStmt) { - const auto &var{std::get(assignmentStmt.t)}; - const auto &expr{std::get(assignmentStmt.t)}; - ErrIfAllocatableVariable(var); - ErrIfLHSAndRHSSymbolsMatch(var, expr); - ErrIfNonScalarAssignmentStmt(var, expr); -} - -void OmpStructureChecker::CheckAtomicUpdateStmt( - const parser::AssignmentStmt &assignment) { - const auto &expr{std::get(assignment.t)}; - const auto &var{std::get(assignment.t)}; - bool isIntrinsicProcedure{false}; - bool isValidOperator{false}; - common::visit( - common::visitors{ - [&](const common::Indirection &x) { - isIntrinsicProcedure = true; - const auto &procedureDesignator{ - std::get(x.value().v.t)}; - const parser::Name *name{ - std::get_if(&procedureDesignator.u)}; - if (name && - !(name->source == "max" || name->source == "min" || - name->source == "iand" || name->source == "ior" || - name->source == "ieor")) { - context_.Say(expr.source, - "Invalid intrinsic procedure name in " - "OpenMP ATOMIC (UPDATE) statement"_err_en_US); - } - }, - [&](const auto &x) { - if (!IsOperatorValid(x, var)) { - context_.Say(expr.source, - "Invalid or missing operator in atomic update " - "statement"_err_en_US); - } else - isValidOperator = true; - }, - }, - expr.u); - if (const auto *e{GetExpr(context_, expr)}) { - const auto *v{GetExpr(context_, var)}; - if (e->Rank() != 0 || - (e->GetType().has_value() && - e->GetType().value().category() == 
common::TypeCategory::Character)) - context_.Say(expr.source, - "Expected scalar expression " - "on the RHS of atomic update assignment " - "statement"_err_en_US); - if (v->Rank() != 0 || - (v->GetType().has_value() && - v->GetType()->category() == common::TypeCategory::Character)) - context_.Say(var.GetSource(), - "Expected scalar variable " - "on the LHS of atomic update assignment " - "statement"_err_en_US); - auto vSyms{evaluate::GetSymbolVector(*v)}; - const Symbol &varSymbol = vSyms.front(); - int numOfSymbolMatches{0}; - SymbolVector exprSymbols{evaluate::GetSymbolVector(*e)}; - for (const Symbol &symbol : exprSymbols) { - if (varSymbol == symbol) { - numOfSymbolMatches++; +namespace atomic { + +struct DesignatorCollector : public evaluate::Traverse, false> { + using Result = std::vector; + using Base = evaluate::Traverse; + DesignatorCollector() : Base(*this) {} + + Result Default() const { return {}; } + + using Base::operator(); + + template // + Result operator()(const evaluate::Designator &x) const { + // Once in a designator, don't traverse it any further (i.e. only + // collect top-level designators). 
+ auto copy{x}; + return Result{AsGenericExpr(std::move(copy))}; + } + + template // + Result Combine(Result &&result, Rs &&...results) const { + Result v(std::move(result)); + auto moveAppend{[](auto &accum, auto &&other) { + for (auto &&s : other) { + accum.push_back(std::move(s)); } + }}; + (moveAppend(v, std::move(results)), ...); + return v; + } +}; + +struct VariableFinder : public evaluate::AnyTraverse { + using Base = evaluate::AnyTraverse; + VariableFinder(const SomeExpr &v) : Base(*this), var(v) {} + + using Base::operator(); + + template + bool operator()(const evaluate::Designator &x) const { + auto copy{x}; + return evaluate::AsGenericExpr(std::move(copy)) == var; + } + + template + bool operator()(const evaluate::FunctionRef &x) const { + auto copy{x}; + return evaluate::AsGenericExpr(std::move(copy)) == var; + } + +private: + const SomeExpr &var; +}; +} // namespace atomic + +static bool IsPointerAssignment(const evaluate::Assignment &x) { + return std::holds_alternative(x.u) || + std::holds_alternative(x.u); +} + +static bool IsCheckForAssociated(const SomeExpr &cond) { + return GetTopLevelOperation(cond).first == operation::Operator::Associated; +} + +static bool HasCommonDesignatorSymbols( + const evaluate::SymbolVector &baseSyms, const SomeExpr &other) { + // Compare the designators used in "other" with the designators whose + // symbols are given in baseSyms. + // This is a part of the check if these two expressions can access the same + // storage: if the designators used in them are different enough, then they + // will be assumed not to access the same memory. + // + // Consider an (array element) expression x%y(w%z), the corresponding symbol + // vector will be {x, y, w, z} (i.e. the symbols for these names). + // Check whether this exact sequence appears anywhere in any the symbol + // vector for "other". 
This will be true for x(y) and x(y+1), so this is + // not a sufficient condition, but can be used to eliminate candidates + // before doing more exhaustive checks. + // + // If any of the symbols in this sequence are function names, assume that + // there is no storage overlap, mostly because it would be impossible in + // general to determine what storage the function will access. + // Note: if f is pure, then two calls to f will access the same storage + // when called with the same arguments. This check is not done yet. + + if (llvm::any_of( + baseSyms, [](const SymbolRef &s) { return s->IsSubprogram(); })) { + // If there is a function symbol in the chain then we can't infer much + // about the accessed storage. + return false; + } + + auto isSubsequence{// Is u a subsequence of v. + [](const evaluate::SymbolVector &u, const evaluate::SymbolVector &v) { + size_t us{u.size()}, vs{v.size()}; + if (us > vs) { + return false; + } + for (size_t off{0}; off != vs - us + 1; ++off) { + bool same{true}; + for (size_t i{0}; i != us; ++i) { + if (u[i] != v[off + i]) { + same = false; + break; + } + } + if (same) { + return true; + } + } + return false; + }}; + + evaluate::SymbolVector otherSyms{evaluate::GetSymbolVector(other)}; + return isSubsequence(baseSyms, otherSyms); +} + +static bool HasCommonTopLevelDesignators( + const std::vector &baseDsgs, const SomeExpr &other) { + // Compare designators directly as expressions. This will ensure + // that x(y) and x(y+1) are not flagged as overlapping, whereas + // the symbol vectors for both of these would be identical. 
+ std::vector otherDsgs{atomic::DesignatorCollector{}(other)}; + + for (auto &s : baseDsgs) { + if (llvm::any_of(otherDsgs, [&](auto &&t) { return s == t; })) { + return true; } - if (isIntrinsicProcedure) { - std::string varName = var.GetSource().ToString(); - if (numOfSymbolMatches != 1) - context_.Say(expr.source, - "Intrinsic procedure" - " arguments in atomic update statement" - " must have exactly one occurence of '%s'"_err_en_US, - varName); - else if (varSymbol != exprSymbols.front() && - varSymbol != exprSymbols.back()) - context_.Say(expr.source, - "Atomic update statement " - "should be of the form `%s = intrinsic_procedure(%s, expr_list)` " - "OR `%s = intrinsic_procedure(expr_list, %s)`"_err_en_US, - varName, varName, varName, varName); - } else if (isValidOperator) { - if (numOfSymbolMatches != 1) - context_.Say(expr.source, - "Exactly one occurence of '%s' " - "expected on the RHS of atomic update assignment statement"_err_en_US, - var.GetSource().ToString()); + } + return false; +} + +static const SomeExpr *HasStorageOverlap( + const SomeExpr &base, llvm::ArrayRef exprs) { + evaluate::SymbolVector baseSyms{evaluate::GetSymbolVector(base)}; + std::vector baseDsgs{atomic::DesignatorCollector{}(base)}; + + for (const SomeExpr &expr : exprs) { + if (!HasCommonDesignatorSymbols(baseSyms, expr)) { + continue; + } + if (HasCommonTopLevelDesignators(baseDsgs, expr)) { + return &expr; } } + return nullptr; +} - ErrIfAllocatableVariable(var); +static bool IsMaybeAtomicWrite(const evaluate::Assignment &assign) { + // This ignores function calls, so it will accept "f(x) = f(x) + 1" + // for example. 
+ return HasStorageOverlap(assign.lhs, assign.rhs) == nullptr; } -void OmpStructureChecker::CheckAtomicCompareConstruct( - const parser::OmpAtomicCompare &atomicCompareConstruct) { +static bool IsSubexpressionOf(const SomeExpr &sub, const SomeExpr &super) { + return atomic::VariableFinder{sub}(super); +} - // TODO: Check that the if-stmt is `if (var == expr) var = new` - // [with or without then/end-do] +static void SetExpr(parser::TypedExpr &expr, MaybeExpr value) { + if (value) { + expr.Reset(new evaluate::GenericExprWrapper(std::move(value)), + evaluate::GenericExprWrapper::Deleter); + } +} - unsigned version{context_.langOptions().OpenMPVersion}; - if (version < 51) { - context_.Say(atomicCompareConstruct.source, - "%s construct not allowed in %s, %s"_err_en_US, - atomicCompareConstruct.source, ThisVersion(version), TryVersion(51)); - } - - // TODO: More work needed here. Some of the Update restrictions need to - // be added, but Update isn't the same either. -} - -// TODO: Allow cond-update-stmt once compare clause is supported. 
-void OmpStructureChecker::CheckAtomicCaptureConstruct( - const parser::OmpAtomicCapture &atomicCaptureConstruct) { - const parser::AssignmentStmt &stmt1 = - std::get(atomicCaptureConstruct.t) - .v.statement; - const auto &stmt1Var{std::get(stmt1.t)}; - const auto &stmt1Expr{std::get(stmt1.t)}; - const auto *v1 = GetExpr(context_, stmt1Var); - const auto *e1 = GetExpr(context_, stmt1Expr); - - const parser::AssignmentStmt &stmt2 = - std::get(atomicCaptureConstruct.t) - .v.statement; - const auto &stmt2Var{std::get(stmt2.t)}; - const auto &stmt2Expr{std::get(stmt2.t)}; - const auto *v2 = GetExpr(context_, stmt2Var); - const auto *e2 = GetExpr(context_, stmt2Expr); - - if (e1 && v1 && e2 && v2) { - if (parser::CheckForSingleVariableOnRHS(stmt1)) { - CheckAtomicCaptureStmt(stmt1); - if (CheckForSymbolMatch(v2, e2)) { - // ATOMIC CAPTURE construct is of the form [capture-stmt, update-stmt] - CheckAtomicUpdateStmt(stmt2); +static void SetAssignment(parser::AssignmentStmt::TypedAssignment &assign, + std::optional value) { + if (value) { + assign.Reset(new evaluate::GenericAssignmentWrapper(std::move(value)), + evaluate::GenericAssignmentWrapper::Deleter); + } +} + +static parser::OpenMPAtomicConstruct::Analysis::Op MakeAtomicAnalysisOp( + int what, + const std::optional &maybeAssign = std::nullopt) { + parser::OpenMPAtomicConstruct::Analysis::Op operation; + operation.what = what; + SetAssignment(operation.assign, maybeAssign); + return operation; +} + +static parser::OpenMPAtomicConstruct::Analysis MakeAtomicAnalysis( + const SomeExpr &atom, const MaybeExpr &cond, + parser::OpenMPAtomicConstruct::Analysis::Op &&op0, + parser::OpenMPAtomicConstruct::Analysis::Op &&op1) { + // Defined in flang/include/flang/Parser/parse-tree.h + // + // struct Analysis { + // struct Kind { + // static constexpr int None = 0; + // static constexpr int Read = 1; + // static constexpr int Write = 2; + // static constexpr int Update = Read | Write; + // static constexpr int Action = 3; // 
Bits containing N, R, W, U + // static constexpr int IfTrue = 4; + // static constexpr int IfFalse = 8; + // static constexpr int Condition = 12; // Bits containing IfTrue, IfFalse + // }; + // struct Op { + // int what; + // TypedAssignment assign; + // }; + // TypedExpr atom, cond; + // Op op0, op1; + // }; + + parser::OpenMPAtomicConstruct::Analysis an; + SetExpr(an.atom, atom); + SetExpr(an.cond, cond); + an.op0 = std::move(op0); + an.op1 = std::move(op1); + return an; +} + +void OmpStructureChecker::CheckStorageOverlap(const SomeExpr &base, + llvm::ArrayRef> exprs, + parser::CharBlock source) { + if (auto *expr{HasStorageOverlap(base, exprs)}) { + context_.Say(source, + "Within atomic operation %s and %s access the same storage"_warn_en_US, + base.AsFortran(), expr->AsFortran()); + } +} + +void OmpStructureChecker::ErrorShouldBeVariable( + const MaybeExpr &expr, parser::CharBlock source) { + if (expr) { + context_.Say(source, "Atomic expression %s should be a variable"_err_en_US, + expr->AsFortran()); + } else { + context_.Say(source, "Atomic expression should be a variable"_err_en_US); + } +} + +/// Check if `expr` satisfies the following conditions for x and v: +/// +/// [6.0:189:10-12] +/// - x and v (as applicable) are either scalar variables or +/// function references with scalar data pointer result of non-character +/// intrinsic type or variables that are non-polymorphic scalar pointers +/// and any length type parameter must be constant. 
+void OmpStructureChecker::CheckAtomicType( + SymbolRef sym, parser::CharBlock source, std::string_view name) { + const DeclTypeSpec *typeSpec{sym->GetType()}; + if (!typeSpec) { + return; + } + + if (!IsPointer(sym)) { + using Category = DeclTypeSpec::Category; + Category cat{typeSpec->category()}; + if (cat == Category::Character) { + context_.Say(source, + "Atomic variable %s cannot have CHARACTER type"_err_en_US, name); + } else if (cat != Category::Numeric && cat != Category::Logical) { + context_.Say(source, + "Atomic variable %s should have an intrinsic type"_err_en_US, name); + } + return; + } + + // Variable is a pointer. + if (typeSpec->IsPolymorphic()) { + context_.Say(source, + "Atomic variable %s cannot be a pointer to a polymorphic type"_err_en_US, + name); + return; + } + + // Go over all length parameters, if any, and check if they are + // explicit. + if (const DerivedTypeSpec *derived{typeSpec->AsDerived()}) { + if (llvm::any_of(derived->parameters(), [](auto &&entry) { + // "entry" is a map entry + return entry.second.isLen() && !entry.second.isExplicit(); + })) { + context_.Say(source, + "Atomic variable %s is a pointer to a type with non-constant length parameter"_err_en_US, + name); + } + } +} + +void OmpStructureChecker::CheckAtomicVariable( + const SomeExpr &atom, parser::CharBlock source) { + if (atom.Rank() != 0) { + context_.Say(source, "Atomic variable %s should be a scalar"_err_en_US, + atom.AsFortran()); + } + + std::vector dsgs{atomic::DesignatorCollector{}(atom)}; + assert(dsgs.size() == 1 && "Should have a single top-level designator"); + evaluate::SymbolVector syms{evaluate::GetSymbolVector(dsgs.front())}; + + CheckAtomicType(syms.back(), source, atom.AsFortran()); + + if (IsAllocatable(syms.back()) && !IsArrayElement(atom)) { + context_.Say(source, "Atomic variable %s cannot be ALLOCATABLE"_err_en_US, + atom.AsFortran()); + } +} + +std::pair +OmpStructureChecker::CheckUpdateCapture( + const parser::ExecutionPartConstruct *ec1, + 
const parser::ExecutionPartConstruct *ec2, parser::CharBlock source) { + // Decide which statement is the atomic update and which is the capture. + // + // The two allowed cases are: + // x = ... atomic-var = ... + // ... = x capture-var = atomic-var (with optional converts) + // or + // ... = x capture-var = atomic-var (with optional converts) + // x = ... atomic-var = ... + // + // The case of 'a = b; b = a' is ambiguous, so pick the first one as capture + // (which makes more sense, as it captures the original value of the atomic + // variable). + // + // If the two statements don't fit these criteria, return a pair of default- + // constructed values. + using ReturnTy = std::pair; + + SourcedActionStmt act1{GetActionStmt(ec1)}; + SourcedActionStmt act2{GetActionStmt(ec2)}; + auto maybeAssign1{GetEvaluateAssignment(act1.stmt)}; + auto maybeAssign2{GetEvaluateAssignment(act2.stmt)}; + if (!maybeAssign1 || !maybeAssign2) { + if (!IsAssignment(act1.stmt) || !IsAssignment(act2.stmt)) { + context_.Say(source, + "ATOMIC UPDATE operation with CAPTURE should contain two assignments"_err_en_US); + } + return std::make_pair(nullptr, nullptr); + } + + auto as1{*maybeAssign1}, as2{*maybeAssign2}; + + auto isUpdateCapture{ + [](const evaluate::Assignment &u, const evaluate::Assignment &c) { + return IsSameOrConvertOf(c.rhs, u.lhs); + }}; + + // Do some checks that narrow down the possible choices for the update + // and the capture statements. This will help to emit better diagnostics. + // 1. An assignment could be an update (cbu) if the left-hand side is a + // subexpression of the right-hand side. + // 2. An assignment could be a capture (cbc) if the right-hand side is + // a variable (or a function ref), with potential type conversions. + bool cbu1{IsSubexpressionOf(as1.lhs, as1.rhs)}; // Can as1 be an update? + bool cbu2{IsSubexpressionOf(as2.lhs, as2.rhs)}; // Can as2 be an update? + bool cbc1{IsVarOrFunctionRef(GetConvertInput(as1.rhs))}; // Can 1 be capture? 
+ bool cbc2{IsVarOrFunctionRef(GetConvertInput(as2.rhs))}; // Can 2 be capture? + + // We want to diagnose cases where both assignments cannot be an update, + // or both cannot be a capture, as well as cases where either assignment + // cannot be any of these two. + // + // If we organize these boolean values into a matrix + // |cbu1 cbu2| + // |cbc1 cbc2| + // then we want to diagnose cases where the matrix has a zero (i.e. "false") + // row or column, including the case where everything is zero. All these + // cases correspond to the determinant of the matrix being 0, which suggests + // that checking the det may be a convenient diagnostic check. There is only + // one additional case where the det is 0, which is when the matrix is all 1 + // ("true"). The "all true" case represents the situation where both + // assignments could be an update as well as a capture. On the other hand, + // whenever det != 0, the roles of the update and the capture can be + // unambiguously assigned to as1 and as2 [1]. + // + // [1] This can be easily verified by hand: there are 10 2x2 matrices with + // det = 0, leaving 6 cases where det != 0: + // 0 1 0 1 1 0 1 0 1 1 1 1 + // 1 0 1 1 0 1 1 1 0 1 1 0 + // In each case the classification is unambiguous. + + // |cbu1 cbu2| + // det |cbc1 cbc2| = cbu1*cbc2 - cbu2*cbc1 + int det{int(cbu1) * int(cbc2) - int(cbu2) * int(cbc1)}; + + auto errorCaptureShouldRead{[&](const parser::CharBlock &source, + const std::string &expr) { + context_.Say(source, + "In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read %s"_err_en_US, + expr); + }}; + + auto errorNeitherWorks{[&]() { + context_.Say(source, + "In ATOMIC UPDATE operation with CAPTURE neither statement could be the update or the capture"_err_en_US); + }}; + + auto makeSelectionFromDet{[&](int det) -> ReturnTy { + // If det != 0, then the checks unambiguously suggest a specific + // categorization. 
+ // If det == 0, then this function should be called only if the + // checks haven't ruled out any possibility, i.e. when both assigments + // could still be either updates or captures. + if (det > 0) { + // as1 is update, as2 is capture + if (isUpdateCapture(as1, as2)) { + return std::make_pair(/*Update=*/ec1, /*Capture=*/ec2); } else { - // ATOMIC CAPTURE construct is of the form [capture-stmt, write-stmt] - CheckAtomicWriteStmt(stmt2); + errorCaptureShouldRead(act2.source, as1.lhs.AsFortran()); + return std::make_pair(nullptr, nullptr); } - if (!(*e1 == *v2)) { - context_.Say(stmt1Expr.source, - "Captured variable/array element/derived-type component %s expected to be assigned in the second statement of ATOMIC CAPTURE construct"_err_en_US, - stmt1Expr.source); + } else if (det < 0) { + // as2 is update, as1 is capture + if (isUpdateCapture(as2, as1)) { + return std::make_pair(/*Update=*/ec2, /*Capture=*/ec1); + } else { + errorCaptureShouldRead(act1.source, as2.lhs.AsFortran()); + return std::make_pair(nullptr, nullptr); + } + } else { + bool updateFirst{isUpdateCapture(as1, as2)}; + bool captureFirst{isUpdateCapture(as2, as1)}; + if (updateFirst && captureFirst) { + // If both assignment could be the update and both could be the + // capture, emit a warning about the ambiguity. 
+ context_.Say(act1.source, + "In ATOMIC UPDATE operation with CAPTURE either statement could be the update and the capture, assuming the first one is the capture statement"_warn_en_US); + return std::make_pair(/*Update=*/ec2, /*Capture=*/ec1); } - } else if (CheckForSymbolMatch(v1, e1) && - parser::CheckForSingleVariableOnRHS(stmt2)) { - // ATOMIC CAPTURE construct is of the form [update-stmt, capture-stmt] - CheckAtomicUpdateStmt(stmt1); - CheckAtomicCaptureStmt(stmt2); - // Variable updated in stmt1 should be captured in stmt2 - if (!(*v1 == *e2)) { - context_.Say(stmt1Var.GetSource(), - "Updated variable/array element/derived-type component %s expected to be captured in the second statement of ATOMIC CAPTURE construct"_err_en_US, - stmt1Var.GetSource()); + if (updateFirst != captureFirst) { + const parser::ExecutionPartConstruct *upd{updateFirst ? ec1 : ec2}; + const parser::ExecutionPartConstruct *cap{captureFirst ? ec1 : ec2}; + return std::make_pair(upd, cap); } + assert(!updateFirst && !captureFirst); + errorNeitherWorks(); + return std::make_pair(nullptr, nullptr); + } + }}; + + if (det != 0 || (cbu1 && cbu2 && cbc1 && cbc2)) { + return makeSelectionFromDet(det); + } + assert(det == 0 && "Prior checks should have covered det != 0"); + + // If neither of the statements is an RMW update, it could still be a + // "write" update. Pretty much any assignment can be a write update, so + // recompute det with cbu1 = cbu2 = true. + if (int writeDet{int(cbc2) - int(cbc1)}; writeDet || (cbc1 && cbc2)) { + return makeSelectionFromDet(writeDet); + } + + // It's only errors from here on. + + if (!cbu1 && !cbu2 && !cbc1 && !cbc2) { + errorNeitherWorks(); + return std::make_pair(nullptr, nullptr); + } + + // The remaining cases are that + // - no candidate for update, or for capture, + // - one of the assigments cannot be anything. 
+ + if (!cbu1 && !cbu2) { + context_.Say(source, + "In ATOMIC UPDATE operation with CAPTURE neither statement could be the update"_err_en_US); + return std::make_pair(nullptr, nullptr); + } else if (!cbc1 && !cbc2) { + context_.Say(source, + "In ATOMIC UPDATE operation with CAPTURE neither statement could be the capture"_err_en_US); + return std::make_pair(nullptr, nullptr); + } + + if ((!cbu1 && !cbc1) || (!cbu2 && !cbc2)) { + auto &src = (!cbu1 && !cbc1) ? act1.source : act2.source; + context_.Say(src, + "In ATOMIC UPDATE operation with CAPTURE the statement could be neither the update nor the capture"_err_en_US); + return std::make_pair(nullptr, nullptr); + } + + // All cases should have been covered. + llvm_unreachable("Unchecked condition"); +} + +void OmpStructureChecker::CheckAtomicCaptureAssignment( + const evaluate::Assignment &capture, const SomeExpr &atom, + parser::CharBlock source) { + auto [lsrc, rsrc]{SplitAssignmentSource(source)}; + const SomeExpr &cap{capture.lhs}; + + if (!IsVarOrFunctionRef(atom)) { + ErrorShouldBeVariable(atom, rsrc); + } else { + CheckAtomicVariable(atom, rsrc); + // This part should have been checked prior to calling this function. + assert(*GetConvertInput(capture.rhs) == atom && + "This cannot be a capture assignment"); + CheckStorageOverlap(atom, {cap}, source); + } +} + +void OmpStructureChecker::CheckAtomicReadAssignment( + const evaluate::Assignment &read, parser::CharBlock source) { + auto [lsrc, rsrc]{SplitAssignmentSource(source)}; + + if (auto maybe{GetConvertInput(read.rhs)}) { + const SomeExpr &atom{*maybe}; + + if (!IsVarOrFunctionRef(atom)) { + ErrorShouldBeVariable(atom, rsrc); } else { - context_.Say(stmt1Expr.source, - "Invalid ATOMIC CAPTURE construct statements. 
Expected one of [update-stmt, capture-stmt], [capture-stmt, update-stmt], or [capture-stmt, write-stmt]"_err_en_US); + CheckAtomicVariable(atom, rsrc); + CheckStorageOverlap(atom, {read.lhs}, source); } + } else { + ErrorShouldBeVariable(read.rhs, rsrc); } } -void OmpStructureChecker::CheckAtomicMemoryOrderClause( - const parser::OmpAtomicClauseList *leftHandClauseList, - const parser::OmpAtomicClauseList *rightHandClauseList) { - int numMemoryOrderClause{0}; - int numFailClause{0}; - auto checkForValidMemoryOrderClause = [&](const parser::OmpAtomicClauseList - *clauseList) { - for (const auto &clause : clauseList->v) { - if (std::get_if(&clause.u)) { - numFailClause++; - if (numFailClause > 1) { - context_.Say(clause.source, - "More than one FAIL clause not allowed on OpenMP ATOMIC construct"_err_en_US); - return; +void OmpStructureChecker::CheckAtomicWriteAssignment( + const evaluate::Assignment &write, parser::CharBlock source) { + // [6.0:190:13-15] + // A write structured block is write-statement, a write statement that has + // one of the following forms: + // x = expr + // x => expr + auto [lsrc, rsrc]{SplitAssignmentSource(source)}; + const SomeExpr &atom{write.lhs}; + + if (!IsVarOrFunctionRef(atom)) { + ErrorShouldBeVariable(atom, rsrc); + } else { + CheckAtomicVariable(atom, lsrc); + CheckStorageOverlap(atom, {write.rhs}, source); + } +} + +void OmpStructureChecker::CheckAtomicUpdateAssignment( + const evaluate::Assignment &update, parser::CharBlock source) { + // [6.0:191:1-7] + // An update structured block is update-statement, an update statement + // that has one of the following forms: + // x = x operator expr + // x = expr operator x + // x = intrinsic-procedure-name (x) + // x = intrinsic-procedure-name (x, expr-list) + // x = intrinsic-procedure-name (expr-list, x) + auto [lsrc, rsrc]{SplitAssignmentSource(source)}; + const SomeExpr &atom{update.lhs}; + + if (!IsVarOrFunctionRef(atom)) { + ErrorShouldBeVariable(atom, rsrc); + // Skip other 
checks. + return; + } + + CheckAtomicVariable(atom, lsrc); + + std::pair> top{ + operation::Operator::Unknown, {}}; + if (auto &&maybeInput{GetConvertInput(update.rhs)}) { + top = GetTopLevelOperation(*maybeInput); + } + switch (top.first) { + case operation::Operator::Add: + case operation::Operator::Sub: + case operation::Operator::Mul: + case operation::Operator::Div: + case operation::Operator::And: + case operation::Operator::Or: + case operation::Operator::Eqv: + case operation::Operator::Neqv: + case operation::Operator::Min: + case operation::Operator::Max: + case operation::Operator::Identity: + break; + case operation::Operator::Call: + context_.Say(source, + "A call to this function is not a valid ATOMIC UPDATE operation"_err_en_US); + return; + case operation::Operator::Convert: + context_.Say(source, + "An implicit or explicit type conversion is not a valid ATOMIC UPDATE operation"_err_en_US); + return; + case operation::Operator::Intrinsic: + context_.Say(source, + "This intrinsic function is not a valid ATOMIC UPDATE operation"_err_en_US); + return; + case operation::Operator::Constant: + case operation::Operator::Unknown: + context_.Say( + source, "This is not a valid ATOMIC UPDATE operation"_err_en_US); + return; + default: + assert( + top.first != operation::Operator::Identity && "Handle this separately"); + context_.Say(source, + "The %s operator is not a valid ATOMIC UPDATE operation"_err_en_US, + operation::ToString(top.first)); + return; + } + // Check if `atom` occurs exactly once in the argument list. 
+ std::vector nonAtom; + auto unique{[&]() { // -> iterator + auto found{top.second.end()}; + for (auto i{top.second.begin()}, e{top.second.end()}; i != e; ++i) { + if (IsSameOrConvertOf(*i, atom)) { + if (found != top.second.end()) { + return top.second.end(); } + found = i; } else { - if (std::get_if(&clause.u)) { - numMemoryOrderClause++; - if (numMemoryOrderClause > 1) { - context_.Say(clause.source, - "More than one memory order clause not allowed on OpenMP ATOMIC construct"_err_en_US); - return; - } + nonAtom.push_back(*i); + } + } + return found; + }()}; + + if (unique == top.second.end()) { + if (top.first == operation::Operator::Identity) { + // This is "x = y". + context_.Say(rsrc, + "The atomic variable %s should appear as an argument in the update operation"_err_en_US, + atom.AsFortran()); + } else { + assert(top.first != operation::Operator::Identity && + "Handle this separately"); + context_.Say(rsrc, + "The atomic variable %s should occur exactly once among the arguments of the top-level %s operator"_err_en_US, + atom.AsFortran(), operation::ToString(top.first)); + } + } else { + CheckStorageOverlap(atom, nonAtom, source); + } +} + +void OmpStructureChecker::CheckAtomicConditionalUpdateAssignment( + const SomeExpr &cond, parser::CharBlock condSource, + const evaluate::Assignment &assign, parser::CharBlock assignSource) { + auto [alsrc, arsrc]{SplitAssignmentSource(assignSource)}; + const SomeExpr &atom{assign.lhs}; + + if (!IsVarOrFunctionRef(atom)) { + ErrorShouldBeVariable(atom, arsrc); + // Skip other checks. + return; + } + + CheckAtomicVariable(atom, alsrc); + + auto top{GetTopLevelOperation(cond)}; + // Missing arguments to operations would have been diagnosed by now. 
+ + switch (top.first) { + case operation::Operator::Associated: + if (atom != top.second.front()) { + context_.Say(assignSource, + "The pointer argument to ASSOCIATED must be same as the target of the assignment"_err_en_US); + } + break; + // x equalop e | e equalop x (allowing "e equalop x" is an extension) + case operation::Operator::Eq: + case operation::Operator::Eqv: + // x ordop expr | expr ordop x + case operation::Operator::Lt: + case operation::Operator::Gt: { + const SomeExpr &arg0{top.second[0]}; + const SomeExpr &arg1{top.second[1]}; + if (IsSameOrConvertOf(arg0, atom)) { + CheckStorageOverlap(atom, {arg1}, condSource); + } else if (IsSameOrConvertOf(arg1, atom)) { + CheckStorageOverlap(atom, {arg0}, condSource); + } else { + assert(top.first != operation::Operator::Identity && + "Handle this separately"); + context_.Say(assignSource, + "An argument of the %s operator should be the target of the assignment"_err_en_US, + operation::ToString(top.first)); + } + break; + } + case operation::Operator::Identity: + case operation::Operator::True: + case operation::Operator::False: + break; + default: + assert( + top.first != operation::Operator::Identity && "Handle this separately"); + context_.Say(condSource, + "The %s operator is not a valid condition for ATOMIC operation"_err_en_US, + operation::ToString(top.first)); + break; + } +} + +void OmpStructureChecker::CheckAtomicConditionalUpdateStmt( + const AnalyzedCondStmt &update, parser::CharBlock source) { + // The condition/statements must be: + // - cond: x equalop e ift: x = d iff: - + // - cond: x ordop expr ift: x = expr iff: - (+ commute ordop) + // - cond: associated(x) ift: x => expr iff: - + // - cond: associated(x, e) ift: x => expr iff: - + + // The if-true statement must be present, and must be an assignment. 
+ auto maybeAssign{GetEvaluateAssignment(update.ift.stmt)}; + if (!maybeAssign) { + if (update.ift.stmt && !IsAssignment(update.ift.stmt)) { + context_.Say(update.ift.source, + "In ATOMIC UPDATE COMPARE the update statement should be an assignment"_err_en_US); + } else { + context_.Say( + source, "Invalid body of ATOMIC UPDATE COMPARE operation"_err_en_US); + } + return; + } + const evaluate::Assignment assign{*maybeAssign}; + const SomeExpr &atom{assign.lhs}; + + CheckAtomicConditionalUpdateAssignment( + update.cond, update.source, assign, update.ift.source); + + CheckStorageOverlap(atom, {assign.rhs}, update.ift.source); + + if (update.iff) { + context_.Say(update.iff.source, + "In ATOMIC UPDATE COMPARE the update statement should not have an ELSE branch"_err_en_US); + } +} + +void OmpStructureChecker::CheckAtomicUpdateOnly( + const parser::OpenMPAtomicConstruct &x, const parser::Block &body, + parser::CharBlock source) { + if (body.size() == 1) { + SourcedActionStmt action{GetActionStmt(&body.front())}; + if (auto maybeUpdate{GetEvaluateAssignment(action.stmt)}) { + const SomeExpr &atom{maybeUpdate->lhs}; + CheckAtomicUpdateAssignment(*maybeUpdate, action.source); + + using Analysis = parser::OpenMPAtomicConstruct::Analysis; + x.analysis = MakeAtomicAnalysis(atom, std::nullopt, + MakeAtomicAnalysisOp(Analysis::Update, maybeUpdate), + MakeAtomicAnalysisOp(Analysis::None)); + } else if (!IsAssignment(action.stmt)) { + context_.Say( + source, "ATOMIC UPDATE operation should be an assignment"_err_en_US); + } + } else { + context_.Say(x.source, + "ATOMIC UPDATE operation should have a single statement"_err_en_US); + } +} + +void OmpStructureChecker::CheckAtomicConditionalUpdate( + const parser::OpenMPAtomicConstruct &x, const parser::Block &body, + parser::CharBlock source) { + // Allowable forms are (single-statement): + // - if ... + // - x = (... ? ... : x) + // and two-statement: + // - r = cond ; if (r) ... 
+ + const parser::ExecutionPartConstruct *ust{nullptr}; // update + const parser::ExecutionPartConstruct *cst{nullptr}; // condition + + if (body.size() == 1) { + ust = &body.front(); + } else if (body.size() == 2) { + cst = &body.front(); + ust = &body.back(); + } else { + context_.Say(source, + "ATOMIC UPDATE COMPARE operation should contain one or two statements"_err_en_US); + return; + } + + // Flang doesn't support conditional-expr yet, so all update statements + // are if-statements. + + // IfStmt: if (...) ... + // IfConstruct: if (...) then ... endif + auto maybeUpdate{AnalyzeConditionalStmt(ust)}; + if (!maybeUpdate) { + context_.Say(source, + "In ATOMIC UPDATE COMPARE the update statement should be a conditional statement"_err_en_US); + return; + } + + AnalyzedCondStmt &update{*maybeUpdate}; + + if (SourcedActionStmt action{GetActionStmt(cst)}) { + // The "condition" statement must be `r = cond`. + if (auto maybeCond{GetEvaluateAssignment(action.stmt)}) { + if (maybeCond->lhs != update.cond) { + context_.Say(update.source, + "In ATOMIC UPDATE COMPARE the conditional statement must use %s as the condition"_err_en_US, + maybeCond->lhs.AsFortran()); + } else { + // If it's "r = ...; if (r) ..." then put the original condition + // in `update`. 
+ update.cond = maybeCond->rhs; + } + } else { + context_.Say(action.source, + "In ATOMIC UPDATE COMPARE with two statements the first statement should compute the condition"_err_en_US); + } + } + + evaluate::Assignment assign{*GetEvaluateAssignment(update.ift.stmt)}; + + CheckAtomicConditionalUpdateStmt(update, source); + if (IsCheckForAssociated(update.cond)) { + if (!IsPointerAssignment(assign)) { + context_.Say(source, + "The assignment should be a pointer-assignment when the condition is ASSOCIATED"_err_en_US); + } + } else { + if (IsPointerAssignment(assign)) { + context_.Say(source, + "The assignment cannot be a pointer-assignment except when the condition is ASSOCIATED"_err_en_US); + } + } + + using Analysis = parser::OpenMPAtomicConstruct::Analysis; + x.analysis = MakeAtomicAnalysis(assign.lhs, update.cond, + MakeAtomicAnalysisOp(Analysis::Update | Analysis::IfTrue, assign), + MakeAtomicAnalysisOp(Analysis::None)); +} + +void OmpStructureChecker::CheckAtomicUpdateCapture( + const parser::OpenMPAtomicConstruct &x, const parser::Block &body, + parser::CharBlock source) { + if (body.size() != 2) { + context_.Say(source, + "ATOMIC UPDATE operation with CAPTURE should contain two statements"_err_en_US); + return; + } + + auto [uec, cec]{CheckUpdateCapture(&body.front(), &body.back(), source)}; + if (!uec || !cec) { + // Diagnostics already emitted. + return; + } + SourcedActionStmt uact{GetActionStmt(uec)}; + SourcedActionStmt cact{GetActionStmt(cec)}; + // The "dereferences" of std::optional are guaranteed to be valid after + // CheckUpdateCapture. 
+ evaluate::Assignment update{*GetEvaluateAssignment(uact.stmt)}; + evaluate::Assignment capture{*GetEvaluateAssignment(cact.stmt)}; + + const SomeExpr &atom{update.lhs}; + + using Analysis = parser::OpenMPAtomicConstruct::Analysis; + int action; + + if (IsMaybeAtomicWrite(update)) { + action = Analysis::Write; + CheckAtomicWriteAssignment(update, uact.source); + } else { + action = Analysis::Update; + CheckAtomicUpdateAssignment(update, uact.source); + } + CheckAtomicCaptureAssignment(capture, atom, cact.source); + + if (IsPointerAssignment(update) != IsPointerAssignment(capture)) { + context_.Say(cact.source, + "The update and capture assignments should both be pointer-assignments or both be non-pointer-assignments"_err_en_US); + return; + } + + if (GetActionStmt(&body.front()).stmt == uact.stmt) { + x.analysis = MakeAtomicAnalysis(atom, std::nullopt, + MakeAtomicAnalysisOp(action, update), + MakeAtomicAnalysisOp(Analysis::Read, capture)); + } else { + x.analysis = MakeAtomicAnalysis(atom, std::nullopt, + MakeAtomicAnalysisOp(Analysis::Read, capture), + MakeAtomicAnalysisOp(action, update)); + } +} + +void OmpStructureChecker::CheckAtomicConditionalUpdateCapture( + const parser::OpenMPAtomicConstruct &x, const parser::Block &body, + parser::CharBlock source) { + // There are two different variants of this: + // (1) conditional-update and capture separately: + // This form only allows single-statement updates, i.e. the update + // form "r = cond; if (r) ..." is not allowed. + // (2) conditional-update combined with capture in a single statement: + // This form does allow the condition to be calculated separately, + // i.e. "r = cond; if (r) ...". + // Regardless of what form it is, the actual update assignment is a + // proper write, i.e. "x = d", where d does not depend on x. 
+ + AnalyzedCondStmt update; + SourcedActionStmt capture; + bool captureAlways{true}, captureFirst{true}; + + auto extractCapture{[&]() { + capture = update.iff; + captureAlways = false; + update.iff = SourcedActionStmt{}; + }}; + + auto classifyNonUpdate{[&](const SourcedActionStmt &action) { + // The non-update statement is either "r = cond" or the capture. + if (auto maybeAssign{GetEvaluateAssignment(action.stmt)}) { + if (update.cond == maybeAssign->lhs) { + // If this is "r = cond; if (r) ...", then update the condition. + update.cond = maybeAssign->rhs; + update.source = action.source; + // In this form, the update and the capture are combined into + // an IF-THEN-ELSE statement. + extractCapture(); + } else { + // Assume this is the capture-statement. + capture = action; + } + } + }}; + + if (body.size() == 2) { + // This could be + // - capture; conditional-update (in any order), or + // - r = cond; if (r) capture-update + const parser::ExecutionPartConstruct *st1{&body.front()}; + const parser::ExecutionPartConstruct *st2{&body.back()}; + // In either case, the conditional statement can be analyzed by + // AnalyzeConditionalStmt, whereas the other statement cannot. + if (auto maybeUpdate1{AnalyzeConditionalStmt(st1)}) { + update = *maybeUpdate1; + classifyNonUpdate(GetActionStmt(st2)); + captureFirst = false; + } else if (auto maybeUpdate2{AnalyzeConditionalStmt(st2)}) { + update = *maybeUpdate2; + classifyNonUpdate(GetActionStmt(st1)); + } else { + // None of the statements are conditional, this rules out the + // "r = cond; if (r) ..." and the "capture + conditional-update" + // variants. This could still be capture + write (which is classified + // as conditional-update-capture in the spec). + auto [uec, cec]{CheckUpdateCapture(st1, st2, source)}; + if (!uec || !cec) { + // Diagnostics already emitted. 
+ return; + } + SourcedActionStmt uact{GetActionStmt(uec)}; + SourcedActionStmt cact{GetActionStmt(cec)}; + update.ift = uact; + capture = cact; + if (uec == st1) { + captureFirst = false; + } + } + } else if (body.size() == 1) { + if (auto maybeUpdate{AnalyzeConditionalStmt(&body.front())}) { + update = *maybeUpdate; + // This is the form with update and capture combined into an IF-THEN-ELSE + // statement. The capture-statement is always the ELSE branch. + extractCapture(); + } else { + goto invalid; + } + } else { + context_.Say(source, + "ATOMIC UPDATE COMPARE CAPTURE operation should contain one or two statements"_err_en_US); + return; + invalid: + context_.Say(source, + "Invalid body of ATOMIC UPDATE COMPARE CAPTURE operation"_err_en_US); + return; + } + + // The update must have a form `x = d` or `x => d`. + if (auto maybeWrite{GetEvaluateAssignment(update.ift.stmt)}) { + const SomeExpr &atom{maybeWrite->lhs}; + CheckAtomicWriteAssignment(*maybeWrite, update.ift.source); + if (auto maybeCapture{GetEvaluateAssignment(capture.stmt)}) { + CheckAtomicCaptureAssignment(*maybeCapture, atom, capture.source); + + if (IsPointerAssignment(*maybeWrite) != + IsPointerAssignment(*maybeCapture)) { + context_.Say(capture.source, + "The update and capture assignments should both be pointer-assignments or both be non-pointer-assignments"_err_en_US); + return; + } + } else { + if (!IsAssignment(capture.stmt)) { + context_.Say(capture.source, + "In ATOMIC UPDATE COMPARE CAPTURE the capture statement should be an assignment"_err_en_US); + } + return; + } + } else { + if (!IsAssignment(update.ift.stmt)) { + context_.Say(update.ift.source, + "In ATOMIC UPDATE COMPARE CAPTURE the update statement should be an assignment"_err_en_US); + } + return; + } + + // update.iff should be empty here, the capture statement should be + // stored in "capture". + + // Fill out the analysis in the AST node. 
+ using Analysis = parser::OpenMPAtomicConstruct::Analysis; + bool condUnused{std::visit( + [](auto &&s) { + using BareS = llvm::remove_cvref_t; + if constexpr (std::is_same_v) { + return true; + } else { + return false; } + }, + update.cond.u)}; + + int updateWhen{!condUnused ? Analysis::IfTrue : 0}; + int captureWhen{!captureAlways ? Analysis::IfFalse : 0}; + + evaluate::Assignment updAssign{*GetEvaluateAssignment(update.ift.stmt)}; + evaluate::Assignment capAssign{*GetEvaluateAssignment(capture.stmt)}; + + if (captureFirst) { + x.analysis = MakeAtomicAnalysis(updAssign.lhs, update.cond, + MakeAtomicAnalysisOp(Analysis::Read | captureWhen, capAssign), + MakeAtomicAnalysisOp(Analysis::Write | updateWhen, updAssign)); + } else { + x.analysis = MakeAtomicAnalysis(updAssign.lhs, update.cond, + MakeAtomicAnalysisOp(Analysis::Write | updateWhen, updAssign), + MakeAtomicAnalysisOp(Analysis::Read | captureWhen, capAssign)); + } +} + +void OmpStructureChecker::CheckAtomicRead( + const parser::OpenMPAtomicConstruct &x) { + // [6.0:190:5-7] + // A read structured block is read-statement, a read statement that has one + // of the following forms: + // v = x + // v => x + auto &dirSpec{std::get(x.t)}; + auto &block{std::get(x.t)}; + + // Read cannot be conditional or have a capture statement. 
+ if (x.IsCompare() || x.IsCapture()) { + context_.Say(dirSpec.source, + "ATOMIC READ cannot have COMPARE or CAPTURE clauses"_err_en_US); + return; + } + + const parser::Block &body{GetInnermostExecPart(block)}; + + if (body.size() == 1) { + SourcedActionStmt action{GetActionStmt(&body.front())}; + if (auto maybeRead{GetEvaluateAssignment(action.stmt)}) { + CheckAtomicReadAssignment(*maybeRead, action.source); + + if (auto maybe{GetConvertInput(maybeRead->rhs)}) { + const SomeExpr &atom{*maybe}; + using Analysis = parser::OpenMPAtomicConstruct::Analysis; + x.analysis = MakeAtomicAnalysis(atom, std::nullopt, + MakeAtomicAnalysisOp(Analysis::Read, maybeRead), + MakeAtomicAnalysisOp(Analysis::None)); } + } else if (!IsAssignment(action.stmt)) { + context_.Say( + x.source, "ATOMIC READ operation should be an assignment"_err_en_US); } - }; - if (leftHandClauseList) { - checkForValidMemoryOrderClause(leftHandClauseList); + } else { + context_.Say(x.source, + "ATOMIC READ operation should have a single statement"_err_en_US); + } +} + +void OmpStructureChecker::CheckAtomicWrite( + const parser::OpenMPAtomicConstruct &x) { + auto &dirSpec{std::get(x.t)}; + auto &block{std::get(x.t)}; + + // Write cannot be conditional or have a capture statement. 
+ if (x.IsCompare() || x.IsCapture()) { + context_.Say(dirSpec.source, + "ATOMIC WRITE cannot have COMPARE or CAPTURE clauses"_err_en_US); + return; } - if (rightHandClauseList) { - checkForValidMemoryOrderClause(rightHandClauseList); + + const parser::Block &body{GetInnermostExecPart(block)}; + + if (body.size() == 1) { + SourcedActionStmt action{GetActionStmt(&body.front())}; + if (auto maybeWrite{GetEvaluateAssignment(action.stmt)}) { + const SomeExpr &atom{maybeWrite->lhs}; + CheckAtomicWriteAssignment(*maybeWrite, action.source); + + using Analysis = parser::OpenMPAtomicConstruct::Analysis; + x.analysis = MakeAtomicAnalysis(atom, std::nullopt, + MakeAtomicAnalysisOp(Analysis::Write, maybeWrite), + MakeAtomicAnalysisOp(Analysis::None)); + } else if (!IsAssignment(action.stmt)) { + context_.Say( + x.source, "ATOMIC WRITE operation should be an assignment"_err_en_US); + } + } else { + context_.Say(x.source, + "ATOMIC WRITE operation should have a single statement"_err_en_US); + } +} + +void OmpStructureChecker::CheckAtomicUpdate( + const parser::OpenMPAtomicConstruct &x) { + auto &block{std::get(x.t)}; + + bool isConditional{x.IsCompare()}; + bool isCapture{x.IsCapture()}; + const parser::Block &body{GetInnermostExecPart(block)}; + + if (isConditional && isCapture) { + CheckAtomicConditionalUpdateCapture(x, body, x.source); + } else if (isConditional) { + CheckAtomicConditionalUpdate(x, body, x.source); + } else if (isCapture) { + CheckAtomicUpdateCapture(x, body, x.source); + } else { // update-only + CheckAtomicUpdateOnly(x, body, x.source); } } void OmpStructureChecker::Enter(const parser::OpenMPAtomicConstruct &x) { - common::visit( - common::visitors{ - [&](const parser::OmpAtomic &atomicConstruct) { - const auto &dir{std::get(atomicConstruct.t)}; - PushContextAndClauseSets( - dir.source, llvm::omp::Directive::OMPD_atomic); - CheckAtomicUpdateStmt( - std::get>( - atomicConstruct.t) - .statement); - CheckAtomicMemoryOrderClause( - 
&std::get(atomicConstruct.t), - nullptr); - CheckHintClause( - &std::get(atomicConstruct.t), - nullptr, "ATOMIC"); - }, - [&](const parser::OmpAtomicUpdate &atomicUpdate) { - const auto &dir{std::get(atomicUpdate.t)}; - PushContextAndClauseSets( - dir.source, llvm::omp::Directive::OMPD_atomic); - CheckAtomicUpdateStmt( - std::get>( - atomicUpdate.t) - .statement); - CheckAtomicMemoryOrderClause( - &std::get<0>(atomicUpdate.t), &std::get<2>(atomicUpdate.t)); - CheckHintClause( - &std::get<0>(atomicUpdate.t), &std::get<2>(atomicUpdate.t), - "UPDATE"); - }, - [&](const parser::OmpAtomicRead &atomicRead) { - const auto &dir{std::get(atomicRead.t)}; - PushContextAndClauseSets( - dir.source, llvm::omp::Directive::OMPD_atomic); - CheckAtomicMemoryOrderClause( - &std::get<0>(atomicRead.t), &std::get<2>(atomicRead.t)); - CheckHintClause( - &std::get<0>(atomicRead.t), &std::get<2>(atomicRead.t), "READ"); - CheckAtomicCaptureStmt( - std::get>( - atomicRead.t) - .statement); - }, - [&](const parser::OmpAtomicWrite &atomicWrite) { - const auto &dir{std::get(atomicWrite.t)}; - PushContextAndClauseSets( - dir.source, llvm::omp::Directive::OMPD_atomic); - CheckAtomicMemoryOrderClause( - &std::get<0>(atomicWrite.t), &std::get<2>(atomicWrite.t)); - CheckHintClause( - &std::get<0>(atomicWrite.t), &std::get<2>(atomicWrite.t), - "WRITE"); - CheckAtomicWriteStmt( - std::get>( - atomicWrite.t) - .statement); - }, - [&](const parser::OmpAtomicCapture &atomicCapture) { - const auto &dir{std::get(atomicCapture.t)}; - PushContextAndClauseSets( - dir.source, llvm::omp::Directive::OMPD_atomic); - CheckAtomicMemoryOrderClause( - &std::get<0>(atomicCapture.t), &std::get<2>(atomicCapture.t)); - CheckHintClause( - &std::get<0>(atomicCapture.t), &std::get<2>(atomicCapture.t), - "CAPTURE"); - CheckAtomicCaptureConstruct(atomicCapture); - }, - [&](const parser::OmpAtomicCompare &atomicCompare) { - const auto &dir{std::get(atomicCompare.t)}; - PushContextAndClauseSets( - dir.source, 
llvm::omp::Directive::OMPD_atomic); - CheckAtomicMemoryOrderClause( - &std::get<0>(atomicCompare.t), &std::get<2>(atomicCompare.t)); - CheckHintClause( - &std::get<0>(atomicCompare.t), &std::get<2>(atomicCompare.t), - "CAPTURE"); - CheckAtomicCompareConstruct(atomicCompare); - }, - }, - x.u); + // All of the following groups have the "exclusive" property, i.e. at + // most one clause from each group is allowed. + // The exclusivity-checking code should eventually be unified for all + // clauses, with clause groups defined in OMP.td. + std::array atomic{llvm::omp::Clause::OMPC_read, + llvm::omp::Clause::OMPC_update, llvm::omp::Clause::OMPC_write}; + std::array memoryOrder{llvm::omp::Clause::OMPC_acq_rel, + llvm::omp::Clause::OMPC_acquire, llvm::omp::Clause::OMPC_relaxed, + llvm::omp::Clause::OMPC_release, llvm::omp::Clause::OMPC_seq_cst}; + + auto checkExclusive{[&](llvm::ArrayRef group, + std::string_view name, + const parser::OmpClauseList &clauses) { + const parser::OmpClause *present{nullptr}; + for (const parser::OmpClause &clause : clauses.v) { + llvm::omp::Clause id{clause.Id()}; + if (!llvm::is_contained(group, id)) { + continue; + } + if (present == nullptr) { + present = &clause; + continue; + } else if (id == present->Id()) { + // Ignore repetitions of the same clause, those will be diagnosed + // separately. 
+ continue; + } + parser::MessageFormattedText txt( + "At most one clause from the '%s' group is allowed on ATOMIC construct"_err_en_US, + name.data()); + parser::Message message(clause.source, txt); + message.Attach(present->source, + "Previous clause from this group provided here"_en_US); + context_.Say(std::move(message)); + return; + } + }}; + + auto &dirSpec{std::get(x.t)}; + auto &dir{std::get(dirSpec.t)}; + PushContextAndClauseSets(dir.source, llvm::omp::Directive::OMPD_atomic); + llvm::omp::Clause kind{x.GetKind()}; + + checkExclusive(atomic, "atomic", dirSpec.Clauses()); + checkExclusive(memoryOrder, "memory-order", dirSpec.Clauses()); + + switch (kind) { + case llvm::omp::Clause::OMPC_read: + CheckAtomicRead(x); + break; + case llvm::omp::Clause::OMPC_write: + CheckAtomicWrite(x); + break; + case llvm::omp::Clause::OMPC_update: + CheckAtomicUpdate(x); + break; + default: + break; + } } void OmpStructureChecker::Leave(const parser::OpenMPAtomicConstruct &) { @@ -3332,7 +4335,6 @@ CHECK_SIMPLE_CLAUSE(Final, OMPC_final) CHECK_SIMPLE_CLAUSE(Flush, OMPC_flush) CHECK_SIMPLE_CLAUSE(Full, OMPC_full) CHECK_SIMPLE_CLAUSE(Grainsize, OMPC_grainsize) -CHECK_SIMPLE_CLAUSE(Hint, OMPC_hint) CHECK_SIMPLE_CLAUSE(Holds, OMPC_holds) CHECK_SIMPLE_CLAUSE(Inclusive, OMPC_inclusive) CHECK_SIMPLE_CLAUSE(Initializer, OMPC_initializer) @@ -4014,40 +5016,6 @@ void OmpStructureChecker::CheckIsLoopIvPartOfClause( } } } -// Following clauses have a separate node in parse-tree.h. 
-// Atomic-clause -CHECK_SIMPLE_PARSER_CLAUSE(OmpAtomicRead, OMPC_read) -CHECK_SIMPLE_PARSER_CLAUSE(OmpAtomicWrite, OMPC_write) -CHECK_SIMPLE_PARSER_CLAUSE(OmpAtomicUpdate, OMPC_update) -CHECK_SIMPLE_PARSER_CLAUSE(OmpAtomicCapture, OMPC_capture) - -void OmpStructureChecker::Leave(const parser::OmpAtomicRead &) { - CheckNotAllowedIfClause(llvm::omp::Clause::OMPC_read, - {llvm::omp::Clause::OMPC_release, llvm::omp::Clause::OMPC_acq_rel}); -} - -void OmpStructureChecker::Leave(const parser::OmpAtomicWrite &) { - CheckNotAllowedIfClause(llvm::omp::Clause::OMPC_write, - {llvm::omp::Clause::OMPC_acquire, llvm::omp::Clause::OMPC_acq_rel}); -} - -void OmpStructureChecker::Leave(const parser::OmpAtomicUpdate &) { - CheckNotAllowedIfClause(llvm::omp::Clause::OMPC_update, - {llvm::omp::Clause::OMPC_acquire, llvm::omp::Clause::OMPC_acq_rel}); -} - -// OmpAtomic node represents atomic directive without atomic-clause. -// atomic-clause - READ,WRITE,UPDATE,CAPTURE. -void OmpStructureChecker::Leave(const parser::OmpAtomic &) { - if (const auto *clause{FindClause(llvm::omp::Clause::OMPC_acquire)}) { - context_.Say(clause->source, - "Clause ACQUIRE is not allowed on the ATOMIC directive"_err_en_US); - } - if (const auto *clause{FindClause(llvm::omp::Clause::OMPC_acq_rel)}) { - context_.Say(clause->source, - "Clause ACQ_REL is not allowed on the ATOMIC directive"_err_en_US); - } -} // Restrictions specific to each clause are implemented apart from the // generalized restrictions. 
@@ -5026,21 +5994,6 @@ void OmpStructureChecker::Leave(const parser::OmpContextSelector &) { ExitDirectiveNest(ContextSelectorNest); } -std::optional OmpStructureChecker::GetDynamicType( - const common::Indirection &parserExpr) { - // Indirection parserExpr - // `- parser::Expr ^.value() - const parser::TypedExpr &typedExpr{parserExpr.value().typedExpr}; - // ForwardOwningPointer typedExpr - // `- GenericExprWrapper ^.get() - // `- std::optional ^->v - if (auto maybeExpr{typedExpr.get()->v}) { - return maybeExpr->GetType(); - } else { - return std::nullopt; - } -} - const std::list & OmpStructureChecker::GetTraitPropertyList( const parser::OmpTraitSelector &trait) { @@ -5430,7 +6383,7 @@ void OmpStructureChecker::CheckTraitCondition( const parser::OmpTraitProperty &property{properties.front()}; auto &scalarExpr{std::get(property.u)}; - auto maybeType{GetDynamicType(scalarExpr.thing)}; + auto maybeType{GetDynamicType(scalarExpr.thing.value())}; if (!maybeType || maybeType->category() != TypeCategory::Logical) { context_.Say(property.source, "%s trait requires a single LOGICAL expression"_err_en_US, diff --git a/flang/lib/Semantics/check-omp-structure.h b/flang/lib/Semantics/check-omp-structure.h index 1a8059d8548ed..2074ec611dc2a 100644 --- a/flang/lib/Semantics/check-omp-structure.h +++ b/flang/lib/Semantics/check-omp-structure.h @@ -48,6 +48,7 @@ static const OmpDirectiveSet noWaitClauseNotAllowedSet{ } // namespace llvm namespace Fortran::semantics { +struct AnalyzedCondStmt; // Mapping from 'Symbol' to 'Source' to keep track of the variables // used in multiple clauses @@ -144,15 +145,6 @@ class OmpStructureChecker void Leave(const parser::OmpClauseList &); void Enter(const parser::OmpClause &); - void Enter(const parser::OmpAtomicRead &); - void Leave(const parser::OmpAtomicRead &); - void Enter(const parser::OmpAtomicWrite &); - void Leave(const parser::OmpAtomicWrite &); - void Enter(const parser::OmpAtomicUpdate &); - void Leave(const parser::OmpAtomicUpdate 
&); - void Enter(const parser::OmpAtomicCapture &); - void Leave(const parser::OmpAtomic &); - void Enter(const parser::DoConstruct &); void Leave(const parser::DoConstruct &); @@ -192,8 +184,6 @@ class OmpStructureChecker void CheckAllowedMapTypes(const parser::OmpMapType::Value &, const std::list &); - std::optional GetDynamicType( - const common::Indirection &); const std::list &GetTraitPropertyList( const parser::OmpTraitSelector &); std::optional GetClauseFromProperty( @@ -265,14 +255,44 @@ class OmpStructureChecker void CheckDoWhile(const parser::OpenMPLoopConstruct &x); void CheckAssociatedLoopConstraints(const parser::OpenMPLoopConstruct &x); template bool IsOperatorValid(const T &, const D &); - void CheckAtomicMemoryOrderClause( - const parser::OmpAtomicClauseList *, const parser::OmpAtomicClauseList *); - void CheckAtomicUpdateStmt(const parser::AssignmentStmt &); - void CheckAtomicCaptureStmt(const parser::AssignmentStmt &); - void CheckAtomicWriteStmt(const parser::AssignmentStmt &); - void CheckAtomicCaptureConstruct(const parser::OmpAtomicCapture &); - void CheckAtomicCompareConstruct(const parser::OmpAtomicCompare &); - void CheckAtomicConstructStructure(const parser::OpenMPAtomicConstruct &); + + void CheckStorageOverlap(const evaluate::Expr &, + llvm::ArrayRef>, parser::CharBlock); + void ErrorShouldBeVariable(const MaybeExpr &expr, parser::CharBlock source); + void CheckAtomicType( + SymbolRef sym, parser::CharBlock source, std::string_view name); + void CheckAtomicVariable( + const evaluate::Expr &, parser::CharBlock); + std::pair + CheckUpdateCapture(const parser::ExecutionPartConstruct *ec1, + const parser::ExecutionPartConstruct *ec2, parser::CharBlock source); + void CheckAtomicCaptureAssignment(const evaluate::Assignment &capture, + const SomeExpr &atom, parser::CharBlock source); + void CheckAtomicReadAssignment( + const evaluate::Assignment &read, parser::CharBlock source); + void CheckAtomicWriteAssignment( + const evaluate::Assignment 
&write, parser::CharBlock source); + void CheckAtomicUpdateAssignment( + const evaluate::Assignment &update, parser::CharBlock source); + void CheckAtomicConditionalUpdateAssignment(const SomeExpr &cond, + parser::CharBlock condSource, const evaluate::Assignment &assign, + parser::CharBlock assignSource); + void CheckAtomicConditionalUpdateStmt( + const AnalyzedCondStmt &update, parser::CharBlock source); + void CheckAtomicUpdateOnly(const parser::OpenMPAtomicConstruct &x, + const parser::Block &body, parser::CharBlock source); + void CheckAtomicConditionalUpdate(const parser::OpenMPAtomicConstruct &x, + const parser::Block &body, parser::CharBlock source); + void CheckAtomicUpdateCapture(const parser::OpenMPAtomicConstruct &x, + const parser::Block &body, parser::CharBlock source); + void CheckAtomicConditionalUpdateCapture( + const parser::OpenMPAtomicConstruct &x, const parser::Block &body, + parser::CharBlock source); + void CheckAtomicRead(const parser::OpenMPAtomicConstruct &x); + void CheckAtomicWrite(const parser::OpenMPAtomicConstruct &x); + void CheckAtomicUpdate(const parser::OpenMPAtomicConstruct &x); + void CheckDistLinear(const parser::OpenMPLoopConstruct &x); void CheckSIMDNest(const parser::OpenMPConstruct &x); void CheckTargetNest(const parser::OpenMPConstruct &x); @@ -324,7 +344,6 @@ class OmpStructureChecker void EnterDirectiveNest(const int index) { directiveNest_[index]++; } void ExitDirectiveNest(const int index) { directiveNest_[index]--; } int GetDirectiveNest(const int index) { return directiveNest_[index]; } - template void CheckHintClause(D *, D *, std::string_view); inline void ErrIfAllocatableVariable(const parser::Variable &); inline void ErrIfLHSAndRHSSymbolsMatch( const parser::Variable &, const parser::Expr &); diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index 3e133b156a9f3..7db447aee0026 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ 
-1684,11 +1684,8 @@ class OmpVisitor : public virtual DeclarationVisitor { messageHandler().set_currStmtSource(std::nullopt); } bool Pre(const parser::OpenMPAtomicConstruct &x) { - return common::visit(common::visitors{[&](const auto &u) -> bool { - AddOmpSourceRange(u.source); - return true; - }}, - x.u); + AddOmpSourceRange(x.source); + return true; } void Post(const parser::OpenMPAtomicConstruct &) { messageHandler().set_currStmtSource(std::nullopt); diff --git a/flang/lib/Semantics/rewrite-directives.cpp b/flang/lib/Semantics/rewrite-directives.cpp index 104a77885d276..b4fef2c881b67 100644 --- a/flang/lib/Semantics/rewrite-directives.cpp +++ b/flang/lib/Semantics/rewrite-directives.cpp @@ -51,23 +51,21 @@ class OmpRewriteMutator : public DirectiveRewriteMutator { bool OmpRewriteMutator::Pre(parser::OpenMPAtomicConstruct &x) { // Find top-level parent of the operation. - Symbol *topLevelParent{common::visit( - [&](auto &atomic) { - Symbol *symbol{nullptr}; - Scope *scope{ - &context_.FindScope(std::get(atomic.t).source)}; - do { - if (Symbol * parent{scope->symbol()}) { - symbol = parent; - } - scope = &scope->parent(); - } while (!scope->IsGlobal()); - - assert(symbol && - "Atomic construct must be within a scope associated with a symbol"); - return symbol; - }, - x.u)}; + Symbol *topLevelParent{[&]() { + Symbol *symbol{nullptr}; + Scope *scope{&context_.FindScope( + std::get(x.t).source)}; + do { + if (Symbol * parent{scope->symbol()}) { + symbol = parent; + } + scope = &scope->parent(); + } while (!scope->IsGlobal()); + + assert(symbol && + "Atomic construct must be within a scope associated with a symbol"); + return symbol; + }()}; // Get the `atomic_default_mem_order` clause from the top-level parent. 
std::optional defaultMemOrder; @@ -86,66 +84,48 @@ bool OmpRewriteMutator::Pre(parser::OpenMPAtomicConstruct &x) { return false; } - auto findMemOrderClause = - [](const std::list &clauses) { - return llvm::any_of(clauses, [](const auto &clause) { - return std::get_if(&clause.u); + auto findMemOrderClause{[](const parser::OmpClauseList &clauses) { + return llvm::any_of( + clauses.v, [](auto &clause) -> const parser::OmpClause * { + switch (clause.Id()) { + case llvm::omp::Clause::OMPC_acq_rel: + case llvm::omp::Clause::OMPC_acquire: + case llvm::omp::Clause::OMPC_relaxed: + case llvm::omp::Clause::OMPC_release: + case llvm::omp::Clause::OMPC_seq_cst: + return &clause; + default: + return nullptr; + } }); - }; - - // Get the clause list to which the new memory order clause must be added, - // only if there are no other memory order clauses present for this atomic - // directive. - std::list *clauseList = common::visit( - common::visitors{[&](parser::OmpAtomic &atomicConstruct) { - // OmpAtomic only has a single list of clauses. - auto &clauses{std::get( - atomicConstruct.t)}; - return !findMemOrderClause(clauses.v) ? &clauses.v - : nullptr; - }, - [&](auto &atomicConstruct) { - // All other atomic constructs have two lists of clauses. - auto &clausesLhs{std::get<0>(atomicConstruct.t)}; - auto &clausesRhs{std::get<2>(atomicConstruct.t)}; - return !findMemOrderClause(clausesLhs.v) && - !findMemOrderClause(clausesRhs.v) - ? &clausesRhs.v - : nullptr; - }}, - x.u); + }}; - // Add a memory order clause to the atomic directive. 
+ auto &dirSpec{std::get(x.t)}; + auto &clauseList{std::get>(dirSpec.t)}; if (clauseList) { - atomicDirectiveDefaultOrderFound_ = true; - switch (*defaultMemOrder) { - case common::OmpMemoryOrderType::Acq_Rel: - clauseList->emplace_back(common::visit( - common::visitors{[](parser::OmpAtomicRead &) -> parser::OmpClause { - return parser::OmpClause::Acquire{}; - }, - [](parser::OmpAtomicCapture &) -> parser::OmpClause { - return parser::OmpClause::AcqRel{}; - }, - [](auto &) -> parser::OmpClause { - // parser::{OmpAtomic, OmpAtomicUpdate, OmpAtomicWrite} - return parser::OmpClause::Release{}; - }}, - x.u)); - break; - case common::OmpMemoryOrderType::Relaxed: - clauseList->emplace_back( - parser::OmpClause{parser::OmpClause::Relaxed{}}); - break; - case common::OmpMemoryOrderType::Seq_Cst: - clauseList->emplace_back( - parser::OmpClause{parser::OmpClause::SeqCst{}}); - break; - default: - // FIXME: Don't process other values at the moment since their validity - // depends on the OpenMP version (which is unavailable here). - break; + if (findMemOrderClause(*clauseList)) { + return false; } + } else { + clauseList = parser::OmpClauseList(decltype(parser::OmpClauseList::v){}); + } + + // Add a memory order clause to the atomic directive. + atomicDirectiveDefaultOrderFound_ = true; + switch (*defaultMemOrder) { + case common::OmpMemoryOrderType::Acq_Rel: + clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::AcqRel{}}); + break; + case common::OmpMemoryOrderType::Relaxed: + clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::Relaxed{}}); + break; + case common::OmpMemoryOrderType::Seq_Cst: + clauseList->v.emplace_back(parser::OmpClause{parser::OmpClause::SeqCst{}}); + break; + default: + // FIXME: Don't process other values at the moment since their validity + // depends on the OpenMP version (which is unavailable here). 
+ break; } return false; diff --git a/flang/lib/Semantics/tools.cpp b/flang/lib/Semantics/tools.cpp index ac69e6ff5cb79..a1445187b1e98 100644 --- a/flang/lib/Semantics/tools.cpp +++ b/flang/lib/Semantics/tools.cpp @@ -17,6 +17,7 @@ #include "flang/Semantics/tools.h" #include "flang/Semantics/type.h" #include "flang/Support/Fortran.h" +#include "llvm/ADT/StringSwitch.h" #include "llvm/Support/raw_ostream.h" #include #include @@ -1770,4 +1771,318 @@ bool CheckForSymbolMatch(const SomeExpr *lhs, const SomeExpr *rhs) { } return false; } -} // namespace Fortran::semantics + +namespace operation { +template // +SomeExpr asSomeExpr(const T &x) { + auto copy{x}; + return AsGenericExpr(std::move(copy)); +} + +template // +struct ArgumentExtractor + : public evaluate::Traverse, + std::pair>, false> { + using Arguments = std::vector; + using Result = std::pair; + using Base = evaluate::Traverse, + Result, false>; + static constexpr auto IgnoreResizes = IgnoreResizingConverts; + static constexpr auto Logical = common::TypeCategory::Logical; + ArgumentExtractor() : Base(*this) {} + + Result Default() const { return {}; } + + using Base::operator(); + + template // + Result operator()( + const evaluate::Constant> &x) const { + if (const auto &val{x.GetScalarValue()}) { + return val->IsTrue() + ? std::make_pair(operation::Operator::True, Arguments{}) + : std::make_pair(operation::Operator::False, Arguments{}); + } + return Default(); + } + + template // + Result operator()(const evaluate::FunctionRef &x) const { + Result result{operation::OperationCode(x.proc()), {}}; + for (size_t i{0}, e{x.arguments().size()}; i != e; ++i) { + if (auto *e{x.UnwrapArgExpr(i)}) { + result.second.push_back(*e); + } + } + return result; + } + + template + Result operator()(const evaluate::Operation &x) const { + if constexpr (std::is_same_v>) { + // Ignore top-level parentheses. 
+ return (*this)(x.template operand<0>()); + } + if constexpr (IgnoreResizes && + std::is_same_v>) { + // Ignore conversions within the same category. + // Atomic operations on int(kind=1) may be implicitly widened + // to int(kind=4) for example. + return (*this)(x.template operand<0>()); + } else { + return std::make_pair(operation::OperationCode(x), + OperationArgs(x, std::index_sequence_for{})); + } + } + + template // + Result operator()(const evaluate::Designator &x) const { + return {operation::Operator::Identity, {asSomeExpr(x)}}; + } + + template // + Result operator()(const evaluate::Constant &x) const { + return {operation::Operator::Identity, {asSomeExpr(x)}}; + } + + template // + Result Combine(Result &&result, Rs &&...results) const { + // There shouldn't be any combining needed, since we're stopping the + // traversal at the top-level operation, but implement one that picks + // the first non-empty result. + if constexpr (sizeof...(Rs) == 0) { + return std::move(result); + } else { + if (!result.second.empty()) { + return std::move(result); + } else { + return Combine(std::move(results)...); + } + } + } + +private: + template + Arguments OperationArgs(const evaluate::Operation &x, + std::index_sequence) const { + return Arguments{SomeExpr(x.template operand())...}; + } +}; +} // namespace operation + +std::string operation::ToString(operation::Operator op) { + switch (op) { + case Operator::Unknown: + return "??"; + case Operator::Add: + return "+"; + case Operator::And: + return "AND"; + case Operator::Associated: + return "ASSOCIATED"; + case Operator::Call: + return "function-call"; + case Operator::Constant: + return "constant"; + case Operator::Convert: + return "type-conversion"; + case Operator::Div: + return "/"; + case Operator::Eq: + return "=="; + case Operator::Eqv: + return "EQV"; + case Operator::False: + return ".FALSE."; + case Operator::Ge: + return ">="; + case Operator::Gt: + return ">"; + case Operator::Identity: + return 
"identity"; + case Operator::Intrinsic: + return "intrinsic"; + case Operator::Le: + return "<="; + case Operator::Lt: + return "<"; + case Operator::Max: + return "MAX"; + case Operator::Min: + return "MIN"; + case Operator::Mul: + return "*"; + case Operator::Ne: + return "/="; + case Operator::Neqv: + return "NEQV/EOR"; + case Operator::Not: + return "NOT"; + case Operator::Or: + return "OR"; + case Operator::Pow: + return "**"; + case Operator::Resize: + return "resize"; + case Operator::Sub: + return "-"; + case Operator::True: + return ".TRUE."; + } + llvm_unreachable("Unhandler operator"); +} + +operation::Operator operation::OperationCode( + const evaluate::ProcedureDesignator &proc) { + Operator code = llvm::StringSwitch(proc.GetName()) + .Case("associated", Operator::Associated) + .Case("min", Operator::Min) + .Case("max", Operator::Max) + .Case("iand", Operator::And) + .Case("ior", Operator::Or) + .Case("ieor", Operator::Neqv) + .Default(Operator::Call); + if (code == Operator::Call && proc.GetSpecificIntrinsic()) { + return Operator::Intrinsic; + } + return code; +} + +std::pair> GetTopLevelOperation( + const SomeExpr &expr) { + return operation::ArgumentExtractor{}(expr); +} + +namespace operation { +struct ConvertCollector + : public evaluate::Traverse>, false> { + using Result = std::pair>; + using Base = evaluate::Traverse; + ConvertCollector() : Base(*this) {} + + Result Default() const { return {}; } + + using Base::operator(); + + template // + Result operator()(const evaluate::Designator &x) const { + return {asSomeExpr(x), {}}; + } + + template // + Result operator()(const evaluate::FunctionRef &x) const { + return {asSomeExpr(x), {}}; + } + + template // + Result operator()(const evaluate::Constant &x) const { + return {asSomeExpr(x), {}}; + } + + template + Result operator()(const evaluate::Operation &x) const { + if constexpr (std::is_same_v>) { + // Ignore parentheses. 
+ return (*this)(x.template operand<0>()); + } else if constexpr (is_convert_v) { + // Convert should always have a typed result, so it should be safe to + // dereference x.GetType(). + return Combine( + {std::nullopt, {*x.GetType()}}, (*this)(x.template operand<0>())); + } else if constexpr (is_complex_constructor_v) { + // This is a conversion iff the imaginary operand is 0. + if (IsZero(x.template operand<1>())) { + return Combine( + {std::nullopt, {*x.GetType()}}, (*this)(x.template operand<0>())); + } else { + return {asSomeExpr(x.derived()), {}}; + } + } else { + return {asSomeExpr(x.derived()), {}}; + } + } + + template // + Result Combine(Result &&result, Rs &&...results) const { + Result v(std::move(result)); + auto setValue{[](MaybeExpr &x, MaybeExpr &&y) { + assert((!x.has_value() || !y.has_value()) && "Multiple designators"); + if (!x.has_value()) { + x = std::move(y); + } + }}; + auto moveAppend{[](auto &accum, auto &&other) { + for (auto &&s : other) { + accum.push_back(std::move(s)); + } + }}; + (setValue(v.first, std::move(results).first), ...); + (moveAppend(v.second, std::move(results).second), ...); + return v; + } + +private: + template // + static bool IsZero(const T &x) { + return false; + } + template // + static bool IsZero(const evaluate::Expr &x) { + return common::visit([](auto &&s) { return IsZero(s); }, x.u); + } + template // + static bool IsZero(const evaluate::Constant &x) { + if (auto &&maybeScalar{x.GetScalarValue()}) { + return maybeScalar->IsZero(); + } else { + return false; + } + } + + template // + struct is_convert { + static constexpr bool value{false}; + }; + template // + struct is_convert> { + static constexpr bool value{true}; + }; + template // + struct is_convert> { + // Conversion from complex to real. 
+ static constexpr bool value{true}; + }; + template // + static constexpr bool is_convert_v = is_convert::value; + + template // + struct is_complex_constructor { + static constexpr bool value{false}; + }; + template // + struct is_complex_constructor> { + static constexpr bool value{true}; + }; + template // + static constexpr bool is_complex_constructor_v = + is_complex_constructor::value; +}; +} // namespace operation + +MaybeExpr GetConvertInput(const SomeExpr &x) { + // This returns SomeExpr(x) when x is a designator/functionref/constant. + return operation::ConvertCollector{}(x).first; +} + +bool IsSameOrConvertOf(const SomeExpr &expr, const SomeExpr &x) { + // Check if expr is same as x, or a sequence of Convert operations on x. + if (expr == x) { + return true; + } else if (auto maybe{GetConvertInput(expr)}) { + return *maybe == x; + } else { + return false; + } +} +} // namespace Fortran::semantics \ No newline at end of file diff --git a/flang/test/Examples/omp-atomic.f90 b/flang/test/Examples/omp-atomic.f90 index dcca34b633a3e..934f84f132484 100644 --- a/flang/test/Examples/omp-atomic.f90 +++ b/flang/test/Examples/omp-atomic.f90 @@ -26,25 +26,31 @@ ! CHECK:--- ! CHECK-NEXT:- file: '{{[^"]*}}omp-atomic.f90' ! CHECK-NEXT: line: 9 -! CHECK-NEXT: construct: atomic-read +! CHECK-NEXT: construct: atomic ! CHECK-NEXT: clauses: -! CHECK-NEXT: - clause: seq_cst +! CHECK-NEXT: - clause: read ! CHECK-NEXT: details: '' +! CHECK-NEXT: - clause: seq_cst +! CHECK-NEXT: details: 'name_modifier=atomic;' ! CHECK-NEXT:- file: '{{[^"]*}}omp-atomic.f90' ! CHECK-NEXT: line: 12 -! CHECK-NEXT: construct: atomic-write +! CHECK-NEXT: construct: atomic ! CHECK-NEXT: clauses: ! CHECK-NEXT: - clause: seq_cst +! CHECK-NEXT: details: 'name_modifier=atomic;' +! CHECK-NEXT: - clause: write ! CHECK-NEXT: details: '' ! CHECK-NEXT:- file: '{{[^"]*}}omp-atomic.f90' ! CHECK-NEXT: line: 16 -! CHECK-NEXT: construct: atomic-capture +! CHECK-NEXT: construct: atomic ! CHECK-NEXT: clauses: +! 
CHECK-NEXT: - clause: capture +! CHECK-NEXT: details: 'name_modifier=atomic;name_modifier=atomic;' ! CHECK-NEXT: - clause: seq_cst ! CHECK-NEXT: details: '' ! CHECK-NEXT:- file: '{{[^"]*}}omp-atomic.f90' ! CHECK-NEXT: line: 21 -! CHECK-NEXT: construct: atomic-atomic +! CHECK-NEXT: construct: atomic ! CHECK-NEXT: clauses: [] ! CHECK-NEXT:- file: '{{[^"]*}}omp-atomic.f90' ! CHECK-NEXT: line: 8 diff --git a/flang/test/Lower/OpenMP/Todo/atomic-compare-fail.f90 b/flang/test/Lower/OpenMP/Todo/atomic-compare-fail.f90 index b82bd13622764..6f58e0939a787 100644 --- a/flang/test/Lower/OpenMP/Todo/atomic-compare-fail.f90 +++ b/flang/test/Lower/OpenMP/Todo/atomic-compare-fail.f90 @@ -1,6 +1,6 @@ ! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s -! CHECK: not yet implemented: OpenMP atomic compare +! CHECK: not yet implemented: OpenMP ATOMIC COMPARE program p integer :: x logical :: r diff --git a/flang/test/Lower/OpenMP/Todo/atomic-compare.f90 b/flang/test/Lower/OpenMP/Todo/atomic-compare.f90 index 88ec6fe910b9e..6729be6e5cf8b 100644 --- a/flang/test/Lower/OpenMP/Todo/atomic-compare.f90 +++ b/flang/test/Lower/OpenMP/Todo/atomic-compare.f90 @@ -1,6 +1,6 @@ ! RUN: %not_todo_cmd %flang_fc1 -emit-fir -fopenmp -fopenmp-version=51 -o - %s 2>&1 | FileCheck %s -! CHECK: not yet implemented: OpenMP atomic compare +! 
CHECK: not yet implemented: OpenMP ATOMIC COMPARE program p integer :: x logical :: r diff --git a/flang/test/Lower/OpenMP/atomic-capture.f90 b/flang/test/Lower/OpenMP/atomic-capture.f90 index 2f800d534dc36..14fd0c942a9b4 100644 --- a/flang/test/Lower/OpenMP/atomic-capture.f90 +++ b/flang/test/Lower/OpenMP/atomic-capture.f90 @@ -79,16 +79,16 @@ subroutine pointers_in_atomic_capture() !CHECK: %[[VAL_A_BOX_ADDR:.*]] = fir.box_addr %[[VAL_A_LOADED]] : (!fir.box>) -> !fir.ptr !CHECK: %[[VAL_B_LOADED:.*]] = fir.load %[[VAL_B_DECLARE]]#0 : !fir.ref>> !CHECK: %[[VAL_B_BOX_ADDR:.*]] = fir.box_addr %[[VAL_B_LOADED]] : (!fir.box>) -> !fir.ptr +!CHECK: %[[VAL_B:.*]] = fir.load %[[VAL_B_BOX_ADDR]] : !fir.ptr !CHECK: %[[VAL_B_LOADED_2:.*]] = fir.load %[[VAL_B_DECLARE]]#0 : !fir.ref>> !CHECK: %[[VAL_B_BOX_ADDR_2:.*]] = fir.box_addr %[[VAL_B_LOADED_2]] : (!fir.box>) -> !fir.ptr -!CHECK: %[[VAL_B:.*]] = fir.load %[[VAL_B_BOX_ADDR_2]] : !fir.ptr !CHECK: omp.atomic.capture { !CHECK: omp.atomic.update %[[VAL_A_BOX_ADDR]] : !fir.ptr { !CHECK: ^bb0(%[[ARG:.*]]: i32): !CHECK: %[[TEMP:.*]] = arith.addi %[[ARG]], %[[VAL_B]] : i32 !CHECK: omp.yield(%[[TEMP]] : i32) !CHECK: } -!CHECK: omp.atomic.read %[[VAL_B_BOX_ADDR]] = %[[VAL_A_BOX_ADDR]] : !fir.ptr, !fir.ptr, i32 +!CHECK: omp.atomic.read %[[VAL_B_BOX_ADDR_2]] = %[[VAL_A_BOX_ADDR]] : !fir.ptr, !fir.ptr, i32 !CHECK: } !CHECK: return !CHECK: } diff --git a/flang/test/Lower/OpenMP/atomic-implicit-cast.f90 b/flang/test/Lower/OpenMP/atomic-implicit-cast.f90 index 4c1be1ca91ac0..5e00235b85e74 100644 --- a/flang/test/Lower/OpenMP/atomic-implicit-cast.f90 +++ b/flang/test/Lower/OpenMP/atomic-implicit-cast.f90 @@ -1,5 +1,3 @@ -! REQUIRES : openmp_runtime - ! RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s ! CHECK: func.func @_QPatomic_implicit_cast_read() { @@ -97,9 +95,9 @@ subroutine atomic_implicit_cast_read ! CHECK: } ! CHECK: omp.atomic.read %[[ALLOCA6]] = %[[X_DECL]]#0 : !fir.ref, !fir.ref, i32 ! 
CHECK: %[[LOAD:.*]] = fir.load %[[ALLOCA6]] : !fir.ref -! CHECK: %[[UNDEF:.*]] = fir.undefined complex ! CHECK: %[[CVT:.*]] = fir.convert %[[LOAD]] : (i32) -> f32 ! CHECK: %[[CST:.*]] = arith.constant 0.000000e+00 : f32 +! CHECK: %[[UNDEF:.*]] = fir.undefined complex ! CHECK: %[[IDX1:.*]] = fir.insert_value %[[UNDEF]], %[[CVT]], [0 : index] : (complex, f32) -> complex ! CHECK: %[[IDX2:.*]] = fir.insert_value %[[IDX1]], %[[CST]], [1 : index] : (complex, f32) -> complex ! CHECK: fir.store %[[IDX2]] to %[[W_DECL]]#0 : !fir.ref> @@ -109,14 +107,14 @@ subroutine atomic_implicit_cast_read !$omp end atomic -! CHECK: omp.atomic.capture { -! CHECK: omp.atomic.update %[[M_DECL]]#0 : !fir.ref> { -! CHECK: ^bb0(%[[ARG:.*]]: complex): ! CHECK: %[[CST1:.*]] = arith.constant 1.000000e+00 : f64 ! CHECK: %[[CST2:.*]] = arith.constant 0.000000e+00 : f64 ! CHECK: %[[UNDEF:.*]] = fir.undefined complex ! CHECK: %[[IDX1:.*]] = fir.insert_value %[[UNDEF]], %[[CST1]], [0 : index] : (complex, f64) -> complex ! CHECK: %[[IDX2:.*]] = fir.insert_value %[[IDX1]], %[[CST2]], [1 : index] : (complex, f64) -> complex +! CHECK: omp.atomic.capture { +! CHECK: omp.atomic.update %[[M_DECL]]#0 : !fir.ref> { +! CHECK: ^bb0(%[[ARG:.*]]: complex): ! CHECK: %[[RESULT:.*]] = fir.addc %[[ARG]], %[[IDX2]] {fastmath = #arith.fastmath} : complex ! CHECK: omp.yield(%[[RESULT]] : complex) ! 
CHECK: } diff --git a/flang/test/Lower/OpenMP/atomic-privatize.f90 b/flang/test/Lower/OpenMP/atomic-privatize.f90 index f922095264fca..c876266cf018c 100644 --- a/flang/test/Lower/OpenMP/atomic-privatize.f90 +++ b/flang/test/Lower/OpenMP/atomic-privatize.f90 @@ -8,7 +8,7 @@ !CHECK: omp.task private(@_QFfredEprv_firstprivate_i32 %{{[0-9]+}}#0 -> %arg0 !CHECK: %[[DECL:[0-9]+]]:2 = hlfir.declare %arg0 {uniq_name = "_QFfredEprv"} -!CHECK: omp.atomic.update %[[DECL]]#0 +!CHECK: omp.atomic.update memory_order(relaxed) %[[DECL]]#0 integer function fred integer :: prv diff --git a/flang/test/Lower/OpenMP/atomic-write.f90 b/flang/test/Lower/OpenMP/atomic-write.f90 index 13392ad76471f..6eded49b0b15d 100644 --- a/flang/test/Lower/OpenMP/atomic-write.f90 +++ b/flang/test/Lower/OpenMP/atomic-write.f90 @@ -44,9 +44,9 @@ end program OmpAtomicWrite !CHECK-LABEL: func.func @_QPatomic_write_pointer() { !CHECK: %[[X_REF:.*]] = fir.alloca !fir.box> {bindc_name = "x", uniq_name = "_QFatomic_write_pointerEx"} !CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_REF]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFatomic_write_pointerEx"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) -!CHECK: %[[C1:.*]] = arith.constant 1 : i32 !CHECK: %[[X_ADDR_BOX:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref>> !CHECK: %[[X_POINTEE_ADDR:.*]] = fir.box_addr %[[X_ADDR_BOX]] : (!fir.box>) -> !fir.ptr +!CHECK: %[[C1:.*]] = arith.constant 1 : i32 !CHECK: omp.atomic.write %[[X_POINTEE_ADDR]] = %[[C1]] : !fir.ptr, i32 !CHECK: %[[C2:.*]] = arith.constant 2 : i32 !CHECK: %[[X_ADDR_BOX:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref>> diff --git a/flang/test/Lower/OpenMP/dump-atomic-analysis.f90 b/flang/test/Lower/OpenMP/dump-atomic-analysis.f90 new file mode 100644 index 0000000000000..cbaf7bc9f2d8a --- /dev/null +++ b/flang/test/Lower/OpenMP/dump-atomic-analysis.f90 @@ -0,0 +1,82 @@ +!RUN: %flang_fc1 -fopenmp -fopenmp-version=60 -emit-hlfir -mmlir -fdebug-dump-atomic-analysis %s -o /dev/null 2>&1 | FileCheck %s + +subroutine 
f00(x) + integer :: x, v + !$omp atomic read + v = x +end + +!CHECK: Analysis { +!CHECK-NEXT: atom: x +!CHECK-NEXT: cond: +!CHECK-NEXT: op0 { +!CHECK-NEXT: what: Read +!CHECK-NEXT: assign: v=x +!CHECK-NEXT: } +!CHECK-NEXT: op1 { +!CHECK-NEXT: what: None +!CHECK-NEXT: assign: +!CHECK-NEXT: } +!CHECK-NEXT: } + + +subroutine f01(v) + integer :: x, v + !$omp atomic write + x = v +end + +!CHECK: Analysis { +!CHECK-NEXT: atom: x +!CHECK-NEXT: cond: +!CHECK-NEXT: op0 { +!CHECK-NEXT: what: Write +!CHECK-NEXT: assign: x=v +!CHECK-NEXT: } +!CHECK-NEXT: op1 { +!CHECK-NEXT: what: None +!CHECK-NEXT: assign: +!CHECK-NEXT: } +!CHECK-NEXT: } + + +subroutine f02(x, v) + integer :: x, v + !$omp atomic update + x = x + v +end + +!CHECK: Analysis { +!CHECK-NEXT: atom: x +!CHECK-NEXT: cond: +!CHECK-NEXT: op0 { +!CHECK-NEXT: what: Update +!CHECK-NEXT: assign: x=x+v +!CHECK-NEXT: } +!CHECK-NEXT: op1 { +!CHECK-NEXT: what: None +!CHECK-NEXT: assign: +!CHECK-NEXT: } +!CHECK-NEXT: } + + +subroutine f03(x, v) + integer :: x, v, t + !$omp atomic update capture + t = x + x = x + v + !$omp end atomic +end + +!CHECK: Analysis { +!CHECK-NEXT: atom: x +!CHECK-NEXT: cond: +!CHECK-NEXT: op0 { +!CHECK-NEXT: what: Read +!CHECK-NEXT: assign: t=x +!CHECK-NEXT: } +!CHECK-NEXT: op1 { +!CHECK-NEXT: what: Update +!CHECK-NEXT: assign: x=x+v +!CHECK-NEXT: } +!CHECK-NEXT: } diff --git a/flang/test/Parser/OpenMP/atomic-compare.f90 b/flang/test/Parser/OpenMP/atomic-compare.f90 index 5cd02698ff482..e09da4a359fcc 100644 --- a/flang/test/Parser/OpenMP/atomic-compare.f90 +++ b/flang/test/Parser/OpenMP/atomic-compare.f90 @@ -1,16 +1,290 @@ -! RUN: not %flang_fc1 -fopenmp-version=51 -fopenmp %s 2>&1 | FileCheck %s -! OpenMP version for documentation purposes only - it isn't used until Sema. -! This is testing for Parser errors that bail out before Sema. -program main - implicit none - integer :: i, j = 10 - logical :: r - - !CHECK: error: expected OpenMP construct - !$omp atomic compare write - r = i .eq. 
j + 1 - - !CHECK: error: expected end of line - !$omp atomic compare num_threads(4) - r = i .eq. j -end program main +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00(a, b) + integer :: a, b + integer :: x + !$omp atomic update compare + if (x < a) x = b +end + +!UNPARSE: SUBROUTINE f00 (a, b) +!UNPARSE: INTEGER a, b +!UNPARSE: INTEGER x +!UNPARSE: !$OMP ATOMIC UPDATE COMPARE +!UNPARSE: IF (x ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update -> +!PARSE-TREE: | | OmpClause -> Compare +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> IfStmt +!PARSE-TREE: | | | Scalar -> Logical -> Expr = 'x DataRef -> Name = 'x' +!PARSE-TREE: | | | | | Expr = 'a' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'a' +!PARSE-TREE: | | | ActionStmt -> AssignmentStmt = 'x=b' +!PARSE-TREE: | | | | Variable = 'x' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | Expr = 'b' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'b' + +subroutine f01(a, b) + integer :: a, b + integer :: x + !$omp atomic update compare + if (x < a) then + x = b + endif +end + +!UNPARSE: SUBROUTINE f01 (a, b) +!UNPARSE: INTEGER a, b +!UNPARSE: INTEGER x +!UNPARSE: !$OMP ATOMIC UPDATE COMPARE +!UNPARSE: IF (x ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update -> +!PARSE-TREE: | | OmpClause -> Compare +!PARSE-TREE: | | Flags = None 
+!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct +!PARSE-TREE: | | | IfThenStmt +!PARSE-TREE: | | | | Scalar -> Logical -> Expr = 'x DataRef -> Name = 'x' +!PARSE-TREE: | | | | | | Expr = 'a' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'a' +!PARSE-TREE: | | | Block +!PARSE-TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=b' +!PARSE-TREE: | | | | | Variable = 'x' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | | Expr = 'b' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'b' +!PARSE-TREE: | | | EndIfStmt -> + +subroutine f02(a, b) + integer :: a, b + integer :: x + logical :: c + c = x < a + !$omp atomic update compare + if (c) then + x = b + endif +end + +!UNPARSE: SUBROUTINE f02 (a, b) +!UNPARSE: INTEGER a, b +!UNPARSE: INTEGER x +!UNPARSE: LOGICAL c +!UNPARSE: c=x ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'c=x DataRef -> Name = 'c' +!PARSE-TREE: | Expr = 'x DataRef -> Name = 'x' +!PARSE-TREE: | | | Expr = 'a' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'a' +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update -> +!PARSE-TREE: | | OmpClause -> Compare +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct +!PARSE-TREE: | | | IfThenStmt +!PARSE-TREE: | | | | Scalar -> Logical -> Expr = 'c' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'c' +!PARSE-TREE: | | | Block +!PARSE-TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=b' +!PARSE-TREE: | | | | | Variable = 'x' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | 
| | Expr = 'b' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'b' +!PARSE-TREE: | | | EndIfStmt -> + +subroutine g00(a, b) + integer :: a, b + integer :: x, v + !$omp atomic update capture compare + v = x + if (x < a) x = b + !$omp end atomic +end + +!UNPARSE: SUBROUTINE g00 (a, b) +!UNPARSE: INTEGER a, b +!UNPARSE: INTEGER x, v +!UNPARSE: !$OMP ATOMIC UPDATE CAPTURE COMPARE +!UNPARSE: v=x +!UNPARSE: IF (x ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update -> +!PARSE-TREE: | | OmpClause -> Capture +!PARSE-TREE: | | OmpClause -> Compare +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x' +!PARSE-TREE: | | | Variable = 'v' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'v' +!PARSE-TREE: | | | Expr = 'x' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> IfStmt +!PARSE-TREE: | | | Scalar -> Logical -> Expr = 'x DataRef -> Name = 'x' +!PARSE-TREE: | | | | | Expr = 'a' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'a' +!PARSE-TREE: | | | ActionStmt -> AssignmentStmt = 'x=b' +!PARSE-TREE: | | | | Variable = 'x' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | Expr = 'b' +!PARSE-TREE: | | | | | Designator -> DataRef -> Name = 'b' +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> +!PARSE-TREE: | | Flags = None + +subroutine g01(a, b) + integer :: a, b + integer :: x, v + !$omp atomic update capture compare + v = x + if (x < a) then + x = b + endif + !$omp end atomic +end + +!UNPARSE: SUBROUTINE g01 (a, b) +!UNPARSE: INTEGER a, b +!UNPARSE: INTEGER x, v 
+!UNPARSE: !$OMP ATOMIC UPDATE CAPTURE COMPARE +!UNPARSE: v=x +!UNPARSE: IF (x ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update -> +!PARSE-TREE: | | OmpClause -> Capture +!PARSE-TREE: | | OmpClause -> Compare +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x' +!PARSE-TREE: | | | Variable = 'v' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'v' +!PARSE-TREE: | | | Expr = 'x' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct +!PARSE-TREE: | | | IfThenStmt +!PARSE-TREE: | | | | Scalar -> Logical -> Expr = 'x DataRef -> Name = 'x' +!PARSE-TREE: | | | | | | Expr = 'a' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'a' +!PARSE-TREE: | | | Block +!PARSE-TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=b' +!PARSE-TREE: | | | | | Variable = 'x' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | | Expr = 'b' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'b' +!PARSE-TREE: | | | EndIfStmt -> +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> +!PARSE-TREE: | | Flags = None + +subroutine g02(a, b) + integer :: a, b + integer :: x, v + !$omp atomic update capture compare + if (x < a) then + x = b + else + v = x + endif + !$omp end atomic +end + +!UNPARSE: SUBROUTINE g02 (a, b) +!UNPARSE: INTEGER a, b +!UNPARSE: INTEGER x, v +!UNPARSE: !$OMP ATOMIC UPDATE CAPTURE COMPARE +!UNPARSE: IF (x ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | 
OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Update -> +!PARSE-TREE: | | OmpClause -> Capture +!PARSE-TREE: | | OmpClause -> Compare +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> IfConstruct +!PARSE-TREE: | | | IfThenStmt +!PARSE-TREE: | | | | Scalar -> Logical -> Expr = 'x DataRef -> Name = 'x' +!PARSE-TREE: | | | | | | Expr = 'a' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'a' +!PARSE-TREE: | | | Block +!PARSE-TREE: | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'x=b' +!PARSE-TREE: | | | | | Variable = 'x' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | | | Expr = 'b' +!PARSE-TREE: | | | | | | Designator -> DataRef -> Name = 'b' +!PARSE-TREE: | | | ElseBlock +!PARSE-TREE: | | | | ElseStmt -> +!PARSE-TREE: | | | | Block +!PARSE-TREE: | | | | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x' +!PARSE-TREE: | | | | | | Variable = 'v' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'v' +!PARSE-TREE: | | | | | | Expr = 'x' +!PARSE-TREE: | | | | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | | | EndIfStmt -> +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> +!PARSE-TREE: | | Flags = None diff --git a/flang/test/Parser/OpenMP/atomic-end.f90 b/flang/test/Parser/OpenMP/atomic-end.f90 new file mode 100644 index 0000000000000..e5eac87517b1e --- /dev/null +++ b/flang/test/Parser/OpenMP/atomic-end.f90 @@ -0,0 +1,63 @@ +!RUN: %flang_fc1 -fdebug-unparse -fopenmp -fopenmp-version=60 %s | FileCheck --ignore-case --check-prefix="UNPARSE" %s +!RUN: %flang_fc1 -fdebug-dump-parse-tree -fopenmp -fopenmp-version=60 %s | FileCheck --check-prefix="PARSE-TREE" %s + +subroutine f00 + integer :: x, v + !$omp atomic read + v 
= x + !$omp end atomic +end + +!UNPARSE: SUBROUTINE f00 +!UNPARSE: INTEGER x, v +!UNPARSE: !$OMP ATOMIC READ +!UNPARSE: v=x +!UNPARSE: !$OMP END ATOMIC +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Read +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x' +!PARSE-TREE: | | | Variable = 'v' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'v' +!PARSE-TREE: | | | Expr = 'x' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> +!PARSE-TREE: | | Flags = None + + +subroutine f01 + integer :: x, v + !$omp atomic read + v = x + !$omp endatomic +end + +!UNPARSE: SUBROUTINE f01 +!UNPARSE: INTEGER x, v +!UNPARSE: !$OMP ATOMIC READ +!UNPARSE: v=x +!UNPARSE: !$OMP END ATOMIC +!UNPARSE: END SUBROUTINE + +!PARSE-TREE: ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPAtomicConstruct +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> OmpClause -> Read +!PARSE-TREE: | | Flags = None +!PARSE-TREE: | Block +!PARSE-TREE: | | ExecutionPartConstruct -> ExecutableConstruct -> ActionStmt -> AssignmentStmt = 'v=x' +!PARSE-TREE: | | | Variable = 'v' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'v' +!PARSE-TREE: | | | Expr = 'x' +!PARSE-TREE: | | | | Designator -> DataRef -> Name = 'x' +!PARSE-TREE: | OmpDirectiveSpecification +!PARSE-TREE: | | OmpDirectiveName -> llvm::omp::Directive = atomic +!PARSE-TREE: | | OmpClauseList -> +!PARSE-TREE: | | Flags = None diff --git 
a/flang/test/Semantics/OpenMP/atomic-compare.f90 b/flang/test/Semantics/OpenMP/atomic-compare.f90 index 54492bf6a22a6..11e23e062bce7 100644 --- a/flang/test/Semantics/OpenMP/atomic-compare.f90 +++ b/flang/test/Semantics/OpenMP/atomic-compare.f90 @@ -44,46 +44,37 @@ !$omp end atomic ! Check for error conditions: - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the COMPARE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst seq_cst compare if (b .eq. c) b = a - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the COMPARE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic compare seq_cst seq_cst if (b .eq. c) b = a - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the COMPARE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst compare seq_cst if (b .eq. c) b = a - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQUIRE clause can appear on the COMPARE directive + !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive !$omp atomic acquire acquire compare if (b .eq. c) b = a - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQUIRE clause can appear on the COMPARE directive + !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive !$omp atomic compare acquire acquire if (b .eq. 
c) b = a - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQUIRE clause can appear on the COMPARE directive + !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive !$omp atomic acquire compare acquire if (b .eq. c) b = a - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the COMPARE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed relaxed compare if (b .eq. c) b = a - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the COMPARE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic compare relaxed relaxed if (b .eq. c) b = a - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the COMPARE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed compare relaxed if (b .eq. c) b = a - !ERROR: More than one FAIL clause not allowed on OpenMP ATOMIC construct + !ERROR: At most one FAIL clause can appear on the ATOMIC directive !$omp atomic fail(release) compare fail(release) if (c .eq. 
a) a = b !$omp end atomic diff --git a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90 b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90 index c13a11a8dd5dc..8adb0f1a67409 100644 --- a/flang/test/Semantics/OpenMP/atomic-hint-clause.f90 +++ b/flang/test/Semantics/OpenMP/atomic-hint-clause.f90 @@ -16,20 +16,21 @@ program sample !$omp atomic read hint(2) y = x - !ERROR: Hint clause value is not a valid OpenMP synchronization value + !ERROR: The synchronization hint is not valid !$omp atomic hint(3) y = y + 10 !$omp atomic update hint(5) y = x + y - !ERROR: Hint clause value is not a valid OpenMP synchronization value + !ERROR: The synchronization hint is not valid !$omp atomic hint(7) capture + !WARNING: In ATOMIC UPDATE operation with CAPTURE either statement could be the update and the capture, assuming the first one is the capture statement y = x x = y !$omp end atomic - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must be a constant value !$omp atomic update hint(x) y = y * 1 @@ -46,7 +47,7 @@ program sample !$omp atomic hint(omp_lock_hint_speculative) x = y + x - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must be a constant value !$omp atomic hint(omp_sync_hint_uncontended + omp_sync_hint) read y = x @@ -69,36 +70,36 @@ program sample !$omp atomic hint(omp_lock_hint_contended + omp_sync_hint_nonspeculative) x = y + x - !ERROR: Hint clause value is not a valid OpenMP synchronization value + !ERROR: The synchronization hint is not valid !$omp atomic hint(omp_sync_hint_uncontended + omp_sync_hint_contended) read y = x - !ERROR: Hint clause value is not a valid OpenMP synchronization value + !ERROR: The synchronization hint is not valid !$omp atomic hint(omp_sync_hint_nonspeculative + omp_lock_hint_speculative) y = y * 9 - !ERROR: Hint clause must have non-negative 
constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must have INTEGER type, but is REAL(4) !$omp atomic hint(1.0) read y = x - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Operands of + must be numeric; have LOGICAL(4) and INTEGER(4) !$omp atomic hint(z + omp_sync_hint_nonspeculative) read y = x - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must be a constant value !$omp atomic hint(k + omp_sync_hint_speculative) read y = x - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must be a constant value !$omp atomic hint(p(1) + omp_sync_hint_uncontended) write x = 10 * y !$omp atomic write hint(a) - !ERROR: RHS expression on atomic assignment statement cannot access 'x' + !ERROR: Within atomic operation x and y+x access the same storage x = y + x !$omp atomic hint(abs(-1)) write diff --git a/flang/test/Semantics/OpenMP/atomic-read.f90 b/flang/test/Semantics/OpenMP/atomic-read.f90 new file mode 100644 index 0000000000000..06c301cb78b77 --- /dev/null +++ b/flang/test/Semantics/OpenMP/atomic-read.f90 @@ -0,0 +1,118 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60 + +subroutine f00 + integer :: x, v + ! The end-directive is optional in ATOMIC READ. Expect no diagnostics. + !$omp atomic read + v = x + + !$omp atomic read + v = x + !$omp end atomic +end + +subroutine f01 + integer, pointer :: x, v + ! Intrinsic assignment and pointer assignment are both ok. Expect no + ! diagnostics. + !$omp atomic read + v = x + + !$omp atomic read + v => x +end + +subroutine f02(i) + integer :: i, v + interface + function p(i) + integer, pointer :: p + integer :: i + end + end interface + + ! 
Atomic variable can be a function reference. Expect no diagnostics. + !$omp atomic read + v = p(i) +end + +subroutine f03 + integer :: x(3), y(5), v(3) + + !$omp atomic read + !ERROR: Atomic variable x should be a scalar + v = x + + !$omp atomic read + !ERROR: Atomic variable y(2_8:4_8:1_8) should be a scalar + v = y(2:4) +end + +subroutine f04 + integer :: x, y(3), v + + !$omp atomic read + !ERROR: Within atomic operation x and x access the same storage + x = x + + ! Accessing same array, but not the same storage. Expect no diagnostics. + !$omp atomic read + y(1) = y(2) +end + +subroutine f05 + integer :: x, v + + !$omp atomic read + !ERROR: Atomic expression x+1_4 should be a variable + v = x + 1 +end + +subroutine f06 + character :: x, v + + !$omp atomic read + !ERROR: Atomic variable x cannot have CHARACTER type + v = x +end + +subroutine f07 + integer, allocatable :: x + integer :: v + + allocate(x) + + !$omp atomic read + !ERROR: Atomic variable x cannot be ALLOCATABLE + v = x +end + +subroutine f08 + type :: struct + integer :: m + end type + type(struct) :: x, v + + !$omp atomic read + !ERROR: Atomic variable x should have an intrinsic type + v = x +end + +subroutine f09(x, v) + class(*), pointer :: x, v + + !$omp atomic read + !ERROR: Atomic variable x cannot be a pointer to a polymorphic type + v => x +end + +subroutine f10(x, v) + type struct(length) + integer, len :: length + end type + type(struct(*)), pointer :: x, v + + !$omp atomic read + !ERROR: Atomic variable x is a pointer to a type with non-constant length parameter + v => x +end diff --git a/flang/test/Semantics/OpenMP/atomic-update-capture.f90 b/flang/test/Semantics/OpenMP/atomic-update-capture.f90 new file mode 100644 index 0000000000000..f808ed916fb7e --- /dev/null +++ b/flang/test/Semantics/OpenMP/atomic-update-capture.f90 @@ -0,0 +1,77 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60 + +subroutine f00 + integer :: x, y, v + + !ERROR: ATOMIC UPDATE operation
with CAPTURE should contain two statements + !$omp atomic update capture + x = v + x = x + 1 + y = x + !$omp end atomic +end + +subroutine f01 + integer :: x, y, v + + !ERROR: ATOMIC UPDATE operation with CAPTURE should contain two assignments + !$omp atomic update capture + x = v + block + x = x + 1 + y = x + end block + !$omp end atomic +end + +subroutine f02 + integer :: x, y + + ! The update and capture statements can be inside of a single BLOCK. + ! The end-directive is then optional. Expect no diagnostics. + !$omp atomic update capture + block + x = x + 1 + y = x + end block +end + +subroutine f03 + integer :: x + + !ERROR: In ATOMIC UPDATE operation with CAPTURE neither statement could be the capture + !$omp atomic update capture + x = x + 1 + x = x + 2 + !$omp end atomic +end + +subroutine f04 + integer :: x, v + + !$omp atomic update capture + !WARNING: In ATOMIC UPDATE operation with CAPTURE either statement could be the update and the capture, assuming the first one is the capture statement + v = x + x = v + !$omp end atomic +end + +subroutine f05 + integer :: x, v, z + + !$omp atomic update capture + !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read z + v = x + z = x + 1 + !$omp end atomic +end + +subroutine f06 + integer :: x, v, z + + !$omp atomic update capture + z = x + 1 + !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read z + v = x + !$omp end atomic +end diff --git a/flang/test/Semantics/OpenMP/atomic-update-only.f90 b/flang/test/Semantics/OpenMP/atomic-update-only.f90 new file mode 100644 index 0000000000000..28d0e264359cb --- /dev/null +++ b/flang/test/Semantics/OpenMP/atomic-update-only.f90 @@ -0,0 +1,83 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60 + +subroutine f00 + integer :: x, y + + ! The x is a direct argument of the + operator. Expect no diagnostics. 
+ !$omp atomic update + x = x + (y - 1) +end + +subroutine f01 + integer :: x + + ! x + 0 is unusual, but legal. Expect no diagnostics. + !$omp atomic update + x = x + 0 +end + +subroutine f02 + integer :: x + + ! This is formally not allowed by the syntax restrictions of the spec, + ! but it's equivalent to either x+0 or x*1, both of which are legal. + ! Allow this case. Expect no diagnostics. + !$omp atomic update + x = x +end + +subroutine f03 + integer :: x, y + + !$omp atomic update + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator + x = (x + y) + 1 +end + +subroutine f04 + integer :: x + real :: y + + !$omp atomic update + !ERROR: This intrinsic function is not a valid ATOMIC UPDATE operation + x = floor(x + y) +end + +subroutine f05 + integer :: x + real :: y + + ! An explicit conversion is accepted as an extension. + !$omp atomic update + x = int(x + y) +end + +subroutine f06 + integer :: x, y + interface + function f(i, j) + integer :: f, i, j + end + end interface + + !$omp atomic update + !ERROR: A call to this function is not a valid ATOMIC UPDATE operation + x = f(x, y) +end + +subroutine f07 + real :: x + integer :: y + + !$omp atomic update + !ERROR: The ** operator is not a valid ATOMIC UPDATE operation + x = x ** y +end + +subroutine f08 + integer :: x, y + + !$omp atomic update + !ERROR: The atomic variable x should appear as an argument in the update operation + x = y +end diff --git a/flang/test/Semantics/OpenMP/atomic-update-overloaded-ops.f90 b/flang/test/Semantics/OpenMP/atomic-update-overloaded-ops.f90 index 21a9b87d26345..3084376b4275d 100644 --- a/flang/test/Semantics/OpenMP/atomic-update-overloaded-ops.f90 +++ b/flang/test/Semantics/OpenMP/atomic-update-overloaded-ops.f90 @@ -22,10 +22,10 @@ program sample x = x / y !$omp atomic update - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: A call to this function is not a valid ATOMIC UPDATE operation x = x 
.MYOPERATOR. y !$omp atomic - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: A call to this function is not a valid ATOMIC UPDATE operation x = x .MYOPERATOR. y end program diff --git a/flang/test/Semantics/OpenMP/atomic-write.f90 b/flang/test/Semantics/OpenMP/atomic-write.f90 new file mode 100644 index 0000000000000..7965ad2dc7dbf --- /dev/null +++ b/flang/test/Semantics/OpenMP/atomic-write.f90 @@ -0,0 +1,81 @@ +!RUN: %python %S/../test_errors.py %s %flang -fopenmp -fopenmp-version=60 + +subroutine f00 + integer :: x, v + ! The end-directive is optional in ATOMIC WRITE. Expect no diagnostics. + !$omp atomic write + x = v + 1 + + !$omp atomic write + x = v + 3 + !$omp end atomic +end + +subroutine f01 + integer, pointer :: x, v + ! Intrinsic assignment and pointer assignment are both ok. Expect no + ! diagnostics. + !$omp atomic write + x = 2 * v + 3 + + !$omp atomic write + x => v +end + +subroutine f02(i) + integer :: i, v + interface + function p(i) + integer, pointer :: p + integer :: i + end + end interface + + ! Atomic variable can be a function reference. Expect no diagnostics. + !$omp atomic write + p(i) = v +end + +subroutine f03 + integer :: x(3), y(5), v(3) + + !$omp atomic write + !ERROR: Atomic variable x should be a scalar + x = v + + !$omp atomic write + !ERROR: Atomic variable y(2_8:4_8:1_8) should be a scalar + y(2:4) = v +end + +subroutine f04 + integer :: x, y(3), v + + !$omp atomic write + !ERROR: Within atomic operation x and x+1_4 access the same storage + x = x + 1 + + ! Accessing same array, but not the same storage. Expect no diagnostics.
+ !$omp atomic write + y(1) = y(2) +end + +subroutine f06 + character :: x, v + + !$omp atomic write + !ERROR: Atomic variable x cannot have CHARACTER type + x = v +end + +subroutine f07 + integer, allocatable :: x + integer :: v + + allocate(x) + + !$omp atomic write + !ERROR: Atomic variable x cannot be ALLOCATABLE + x = v +end + diff --git a/flang/test/Semantics/OpenMP/atomic.f90 b/flang/test/Semantics/OpenMP/atomic.f90 index 0e100871ea9b4..10b33a3ade22d 100644 --- a/flang/test/Semantics/OpenMP/atomic.f90 +++ b/flang/test/Semantics/OpenMP/atomic.f90 @@ -1,4 +1,6 @@ -! RUN: %python %S/../test_errors.py %s %flang -fopenmp +! REQUIRES: openmp_runtime + +! RUN: %python %S/../test_errors.py %s %flang -fopenmp %openmp_flags use omp_lib ! Check OpenMP 2.13.6 atomic Construct @@ -11,9 +13,13 @@ a = b !$omp end atomic + !ERROR: ACQUIRE clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50 + !ERROR: HINT clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50 !$omp atomic read acquire hint(OMP_LOCK_HINT_CONTENDED) a = b + !ERROR: RELEASE clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50 + !ERROR: HINT clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50 !$omp atomic release hint(OMP_LOCK_HINT_UNCONTENDED) write a = b @@ -22,39 +28,32 @@ a = a + 1 !$omp end atomic + !ERROR: HINT clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50 + !ERROR: ACQ_REL clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50 !$omp atomic hint(1) acq_rel capture b = a a = a + 1 !$omp end atomic - !ERROR: expected end of line + !ERROR: At most one clause from the 'atomic' group is allowed on ATOMIC construct !$omp atomic read write + !ERROR: Atomic expression a+1._4 should be a variable a = a + 1 !$omp atomic a = a + 1 - !ERROR: expected 'UPDATE' - !ERROR: expected 'WRITE' - !ERROR: expected 'COMPARE' - !ERROR: expected 'CAPTURE' 
- !ERROR: expected 'READ' + !ERROR: NUM_THREADS clause is not allowed on the ATOMIC directive !$omp atomic num_threads(4) a = a + 1 - !ERROR: expected end of line + !ERROR: ATOMIC UPDATE operation with CAPTURE should contain two statements + !ERROR: NUM_THREADS clause is not allowed on the ATOMIC directive !$omp atomic capture num_threads(4) a = a + 1 + !ERROR: RELAXED clause is not allowed on directive ATOMIC in OpenMP v3.1, try -fopenmp-version=50 !$omp atomic relaxed a = a + 1 - !ERROR: expected 'UPDATE' - !ERROR: expected 'WRITE' - !ERROR: expected 'COMPARE' - !ERROR: expected 'CAPTURE' - !ERROR: expected 'READ' - !$omp atomic num_threads write - a = a + 1 - !$omp end parallel end diff --git a/flang/test/Semantics/OpenMP/atomic01.f90 b/flang/test/Semantics/OpenMP/atomic01.f90 index 173effe86b69c..f700c381cadd0 100644 --- a/flang/test/Semantics/OpenMP/atomic01.f90 +++ b/flang/test/Semantics/OpenMP/atomic01.f90 @@ -14,322 +14,277 @@ ! At most one memory-order-clause may appear on the construct. 
!READ - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the READ directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst seq_cst read i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the READ directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic read seq_cst seq_cst i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the READ directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst read seq_cst i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQUIRE clause can appear on the READ directive + !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive !$omp atomic acquire acquire read i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQUIRE clause can appear on the READ directive + !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive !$omp atomic read acquire acquire i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQUIRE clause can appear on the READ directive + !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive !$omp atomic acquire read acquire i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the READ directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed relaxed read i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the 
READ directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic read relaxed relaxed i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the READ directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed read relaxed i = j !UPDATE - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the UPDATE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst seq_cst update - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the UPDATE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic update seq_cst seq_cst - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the UPDATE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst update seq_cst - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELEASE clause can appear on the UPDATE directive + !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic release release update - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The 
atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELEASE clause can appear on the UPDATE directive + !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic update release release - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELEASE clause can appear on the UPDATE directive + !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic release update release - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the UPDATE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed relaxed update - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the UPDATE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic update relaxed relaxed - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the UPDATE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic 
relaxed update relaxed - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j !CAPTURE - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the CAPTURE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst seq_cst capture i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the CAPTURE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic capture seq_cst seq_cst i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the CAPTURE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst capture seq_cst i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELEASE clause can appear on the CAPTURE directive + !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic release release capture i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELEASE clause can appear on the CAPTURE directive + !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic capture release release i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELEASE clause can appear on the CAPTURE directive + !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic release capture release i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not 
allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the CAPTURE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed relaxed capture i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the CAPTURE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic capture relaxed relaxed i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the CAPTURE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed capture relaxed i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQ_REL clause can appear on the CAPTURE directive + !ERROR: At most one ACQ_REL clause can appear on the ATOMIC directive !$omp atomic acq_rel acq_rel capture i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQ_REL clause can appear on the CAPTURE directive + !ERROR: At most one ACQ_REL clause can appear on the ATOMIC directive !$omp atomic capture acq_rel acq_rel i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQ_REL clause can appear on the CAPTURE directive + !ERROR: At most one ACQ_REL clause can appear on the ATOMIC directive !$omp atomic acq_rel capture acq_rel i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQUIRE clause can appear on the CAPTURE directive + !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive !$omp atomic acquire acquire capture i = j j = k !$omp 
end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQUIRE clause can appear on the CAPTURE directive + !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive !$omp atomic capture acquire acquire i = j j = k !$omp end atomic - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one ACQUIRE clause can appear on the CAPTURE directive + !ERROR: At most one ACQUIRE clause can appear on the ATOMIC directive !$omp atomic acquire capture acquire i = j j = k !$omp end atomic !WRITE - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the WRITE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst seq_cst write i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the WRITE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic write seq_cst seq_cst i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one SEQ_CST clause can appear on the WRITE directive + !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst write seq_cst i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELEASE clause can appear on the WRITE directive + !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic release release write i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELEASE clause can appear on the WRITE directive + !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic write release release i = j - !ERROR: More than one memory order clause not allowed on 
OpenMP ATOMIC construct - !ERROR: At most one RELEASE clause can appear on the WRITE directive + !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic release write release i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the WRITE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed relaxed write i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the WRITE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic write relaxed relaxed i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct - !ERROR: At most one RELAXED clause can appear on the WRITE directive + !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed write relaxed i = j !No atomic-clause - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct !ERROR: At most one RELAXED clause can appear on the ATOMIC directive !$omp atomic relaxed relaxed - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct !ERROR: At most one SEQ_CST clause can appear on the ATOMIC directive !$omp atomic seq_cst seq_cst - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct !ERROR: At most one RELEASE clause can appear on the ATOMIC directive !$omp atomic release release - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as 
an argument in the update operation i = j ! 2.17.7.3 ! At most one hint clause may appear on the construct. - !ERROR: At most one HINT clause can appear on the READ directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_speculative) hint(omp_sync_hint_speculative) read i = j - !ERROR: At most one HINT clause can appear on the READ directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_nonspeculative) read hint(omp_sync_hint_nonspeculative) i = j - !ERROR: At most one HINT clause can appear on the READ directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic read hint(omp_sync_hint_uncontended) hint (omp_sync_hint_uncontended) i = j - !ERROR: At most one HINT clause can appear on the WRITE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) write i = j - !ERROR: At most one HINT clause can appear on the WRITE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_nonspeculative) write hint(omp_sync_hint_nonspeculative) i = j - !ERROR: At most one HINT clause can appear on the WRITE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic write hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended) i = j - !ERROR: At most one HINT clause can appear on the WRITE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) write i = j - !ERROR: At most one HINT clause can appear on the WRITE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_nonspeculative) write hint(omp_sync_hint_nonspeculative) i = j - !ERROR: At most one HINT clause can appear on the WRITE directive + !ERROR: At 
most one HINT clause can appear on the ATOMIC directive !$omp atomic write hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended) i = j - !ERROR: At most one HINT clause can appear on the UPDATE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) update - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: At most one HINT clause can appear on the UPDATE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_nonspeculative) update hint(omp_sync_hint_nonspeculative) - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: At most one HINT clause can appear on the UPDATE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic update hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended) - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_none) hint(omp_sync_hint_nonspeculative) - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_none) hint 
(omp_sync_hint_uncontended) - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: At most one HINT clause can appear on the CAPTURE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_contended) hint(omp_sync_hint_speculative) capture i = j j = k !$omp end atomic - !ERROR: At most one HINT clause can appear on the CAPTURE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic hint(omp_sync_hint_nonspeculative) capture hint(omp_sync_hint_nonspeculative) i = j j = k !$omp end atomic - !ERROR: At most one HINT clause can appear on the CAPTURE directive + !ERROR: At most one HINT clause can appear on the ATOMIC directive !$omp atomic capture hint(omp_sync_hint_none) hint (omp_sync_hint_uncontended) i = j j = k @@ -337,34 +292,26 @@ ! 2.17.7.4 ! If atomic-clause is read then memory-order-clause must not be acq_rel or release. - !ERROR: Clause ACQ_REL is not allowed if clause READ appears on the ATOMIC directive !$omp atomic acq_rel read i = j - !ERROR: Clause ACQ_REL is not allowed if clause READ appears on the ATOMIC directive !$omp atomic read acq_rel i = j - !ERROR: Clause RELEASE is not allowed if clause READ appears on the ATOMIC directive !$omp atomic release read i = j - !ERROR: Clause RELEASE is not allowed if clause READ appears on the ATOMIC directive !$omp atomic read release i = j ! 2.17.7.5 ! If atomic-clause is write then memory-order-clause must not be acq_rel or acquire. 
- !ERROR: Clause ACQ_REL is not allowed if clause WRITE appears on the ATOMIC directive !$omp atomic acq_rel write i = j - !ERROR: Clause ACQ_REL is not allowed if clause WRITE appears on the ATOMIC directive !$omp atomic write acq_rel i = j - !ERROR: Clause ACQUIRE is not allowed if clause WRITE appears on the ATOMIC directive !$omp atomic acquire write i = j - !ERROR: Clause ACQUIRE is not allowed if clause WRITE appears on the ATOMIC directive !$omp atomic write acquire i = j @@ -372,33 +319,27 @@ ! 2.17.7.6 ! If atomic-clause is update or not present then memory-order-clause must not be acq_rel or acquire. - !ERROR: Clause ACQ_REL is not allowed if clause UPDATE appears on the ATOMIC directive !$omp atomic acq_rel update - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: Clause ACQ_REL is not allowed if clause UPDATE appears on the ATOMIC directive !$omp atomic update acq_rel - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: Clause ACQUIRE is not allowed if clause UPDATE appears on the ATOMIC directive !$omp atomic acquire update - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: Clause ACQUIRE is not allowed if clause UPDATE appears on the ATOMIC directive !$omp atomic update acquire - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: Clause ACQ_REL is not allowed on the ATOMIC directive !$omp atomic acq_rel - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j - !ERROR: Clause ACQUIRE is not allowed on the 
ATOMIC directive !$omp atomic acquire - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable i should appear as an argument in the update operation i = j end program diff --git a/flang/test/Semantics/OpenMP/atomic02.f90 b/flang/test/Semantics/OpenMP/atomic02.f90 index c66085d00f157..45e41f2552965 100644 --- a/flang/test/Semantics/OpenMP/atomic02.f90 +++ b/flang/test/Semantics/OpenMP/atomic02.f90 @@ -28,36 +28,29 @@ program OmpAtomic !$omp atomic a = a/(b + 1) !$omp atomic - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The ** operator is not a valid ATOMIC UPDATE operation a = a**4 !$omp atomic - !ERROR: Expected scalar variable on the LHS of atomic update assignment statement - !ERROR: Invalid or missing operator in atomic update statement - !ERROR: Expected scalar expression on the RHS of atomic update assignment statement + !ERROR: Atomic variable c cannot have CHARACTER type + !ERROR: The atomic variable c should appear as an argument in the update operation c = d !$omp atomic - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The < operator is not a valid ATOMIC UPDATE operation l = a .LT. b !$omp atomic - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The <= operator is not a valid ATOMIC UPDATE operation l = a .LE. b !$omp atomic - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The == operator is not a valid ATOMIC UPDATE operation l = a .EQ. 
b !$omp atomic - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The /= operator is not a valid ATOMIC UPDATE operation l = a .NE. b !$omp atomic - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The >= operator is not a valid ATOMIC UPDATE operation l = a .GE. b !$omp atomic - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The > operator is not a valid ATOMIC UPDATE operation l = a .GT. b !$omp atomic m = m .AND. n @@ -76,32 +69,26 @@ program OmpAtomic !$omp atomic update a = a/(b + 1) !$omp atomic update - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The ** operator is not a valid ATOMIC UPDATE operation a = a**4 !$omp atomic update - !ERROR: Expected scalar variable on the LHS of atomic update assignment statement - !ERROR: Invalid or missing operator in atomic update statement - !ERROR: Expected scalar expression on the RHS of atomic update assignment statement + !ERROR: Atomic variable c cannot have CHARACTER type + !ERROR: This is not a valid ATOMIC UPDATE operation c = c//d !$omp atomic update - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The < operator is not a valid ATOMIC UPDATE operation l = a .LT. b !$omp atomic update - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The <= operator is not a valid ATOMIC UPDATE operation l = a .LE. 
b !$omp atomic update - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The == operator is not a valid ATOMIC UPDATE operation l = a .EQ. b !$omp atomic update - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The >= operator is not a valid ATOMIC UPDATE operation l = a .GE. b !$omp atomic update - !ERROR: Atomic update statement should be of form `l = l operator expr` OR `l = expr operator l` - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The > operator is not a valid ATOMIC UPDATE operation l = a .GT. b !$omp atomic update m = m .AND. n diff --git a/flang/test/Semantics/OpenMP/atomic03.f90 b/flang/test/Semantics/OpenMP/atomic03.f90 index 76367495b9861..b3a3c0d5e7a14 100644 --- a/flang/test/Semantics/OpenMP/atomic03.f90 +++ b/flang/test/Semantics/OpenMP/atomic03.f90 @@ -25,28 +25,26 @@ program OmpAtomic y = MIN(y, 8) !$omp atomic - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level AND operator z = IAND(y, 4) !$omp atomic - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level OR operator z = IOR(y, 5) !$omp atomic - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level NEQV/EOR operator z = IEOR(y, 6) !$omp atomic - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur 
exactly once among the arguments of the top-level MAX operator z = MAX(y, 7, b, c) !$omp atomic - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MIN operator z = MIN(y, 8, a, d) !$omp atomic - !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'y' + !ERROR: This intrinsic function is not a valid ATOMIC UPDATE operation y = FRACTION(x) !$omp atomic - !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'y' + !ERROR: The atomic variable y should appear as an argument in the update operation y = REAL(x) !$omp atomic update y = IAND(y, 4) @@ -60,26 +58,26 @@ program OmpAtomic y = MIN(y, 8) !$omp atomic update - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level AND operator z = IAND(y, 4) !$omp atomic update - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level OR operator z = IOR(y, 5) !$omp atomic update - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level NEQV/EOR operator z = IEOR(y, 6) !$omp atomic update - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MAX operator z = MAX(y, 7) 
!$omp atomic update - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level MIN operator z = MIN(y, 8) !$omp atomic update - !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement + !ERROR: This intrinsic function is not a valid ATOMIC UPDATE operation y = MOD(y, 9) !$omp atomic update - !ERROR: Invalid intrinsic procedure name in OpenMP ATOMIC (UPDATE) statement + !ERROR: This intrinsic function is not a valid ATOMIC UPDATE operation x = ABS(x) end program OmpAtomic @@ -92,7 +90,7 @@ subroutine conflicting_types() type(simple) ::s z = 1 !$omp atomic - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'z' + !ERROR: The atomic variable z should occur exactly once among the arguments of the top-level AND operator z = IAND(s%z, 4) end subroutine @@ -105,40 +103,37 @@ subroutine more_invalid_atomic_update_stmts() type(some_type) :: s !$omp atomic update - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'a' + !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level MIN operator a = min(a, a, b) !$omp atomic - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'a' + !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level MAX operator a = max(b, a, b, a) !$omp atomic - !ERROR: Atomic update statement should be of the form `a = intrinsic_procedure(a, expr_list)` OR `a = intrinsic_procedure(expr_list, a)` a = min(b, a, b) !$omp atomic - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'a' + !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level MAX operator a = max(b, a, b, a, b) !$omp atomic update - 
!ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'y' + !ERROR: The atomic variable y should occur exactly once among the arguments of the top-level MIN operator y = min(z, x) !$omp atomic z = max(z, y) !$omp atomic update - !ERROR: Expected scalar variable on the LHS of atomic update assignment statement - !ERROR: Intrinsic procedure arguments in atomic update statement must have exactly one occurence of 'k' + !ERROR: Atomic variable k should be a scalar + !ERROR: The atomic variable k should occur exactly once among the arguments of the top-level MAX operator k = max(x, y) - + !$omp atomic !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4) - !ERROR: Expected scalar expression on the RHS of atomic update assignment statement x = min(x, k) !$omp atomic !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4) - !ERROR: Expected scalar expression on the RHS of atomic update assignment statement - z =z + s%m + z = z + s%m end subroutine diff --git a/flang/test/Semantics/OpenMP/atomic04.f90 b/flang/test/Semantics/OpenMP/atomic04.f90 index a9644ad95aa30..0f69befed1414 100644 --- a/flang/test/Semantics/OpenMP/atomic04.f90 +++ b/flang/test/Semantics/OpenMP/atomic04.f90 @@ -1,5 +1,3 @@ -! REQUIRES: openmp_runtime - ! RUN: %python %S/../test_errors.py %s %flang_fc1 %openmp_flags ! OpenMP Atomic construct @@ -7,7 +5,6 @@ ! 
Update assignment must be 'var = var op expr' or 'var = expr op var' program OmpAtomic - use omp_lib real x integer y logical m, n, l @@ -20,12 +17,10 @@ program OmpAtomic !$omp atomic x = 1 + x !$omp atomic - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator x = y + 1 !$omp atomic - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator x = 1 + y !$omp atomic @@ -33,12 +28,10 @@ program OmpAtomic !$omp atomic x = 1 - x !$omp atomic - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator x = y - 1 !$omp atomic - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator x = 1 - y !$omp atomic @@ -46,12 +39,10 @@ program OmpAtomic !$omp atomic x = 1*x !$omp atomic - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should appear as an argument in the update operation x = y*1 !$omp atomic - !ERROR: Atomic update statement should be 
of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should appear as an argument in the update operation x = 1*y !$omp atomic @@ -59,12 +50,10 @@ program OmpAtomic !$omp atomic x = 1/x !$omp atomic - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator x = y/1 !$omp atomic - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator x = 1/y !$omp atomic @@ -72,8 +61,7 @@ program OmpAtomic !$omp atomic m = n .AND. m !$omp atomic - !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m` - !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level AND operator m = n .AND. l !$omp atomic @@ -81,8 +69,7 @@ program OmpAtomic !$omp atomic m = n .OR. m !$omp atomic - !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m` - !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level OR operator m = n .OR. l !$omp atomic @@ -90,8 +77,7 @@ program OmpAtomic !$omp atomic m = n .EQV. 
m !$omp atomic - !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m` - !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level EQV operator m = n .EQV. l !$omp atomic @@ -99,8 +85,7 @@ program OmpAtomic !$omp atomic m = n .NEQV. m !$omp atomic - !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m` - !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level NEQV/EOR operator m = n .NEQV. l !$omp atomic update @@ -108,12 +93,10 @@ program OmpAtomic !$omp atomic update x = 1 + x !$omp atomic update - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator x = y + 1 !$omp atomic update - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level + operator x = 1 + y !$omp atomic update @@ -121,12 +104,10 @@ program OmpAtomic !$omp atomic update x = 1 - x !$omp atomic update - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator x = y - 1 !$omp atomic update - !ERROR: Atomic update statement should be of form `x = 
x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level - operator x = 1 - y !$omp atomic update @@ -134,12 +115,10 @@ program OmpAtomic !$omp atomic update x = 1*x !$omp atomic update - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should appear as an argument in the update operation x = y*1 !$omp atomic update - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should appear as an argument in the update operation x = 1*y !$omp atomic update @@ -147,12 +126,10 @@ program OmpAtomic !$omp atomic update x = 1/x !$omp atomic update - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator x = y/1 !$omp atomic update - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Exactly one occurence of 'x' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator x = 1/y !$omp atomic update @@ -160,8 +137,7 @@ program OmpAtomic !$omp atomic update m = n .AND. 
m !$omp atomic update - !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m` - !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level AND operator m = n .AND. l !$omp atomic update @@ -169,8 +145,7 @@ program OmpAtomic !$omp atomic update m = n .OR. m !$omp atomic update - !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m` - !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level OR operator m = n .OR. l !$omp atomic update @@ -178,8 +153,7 @@ program OmpAtomic !$omp atomic update m = n .EQV. m !$omp atomic update - !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m` - !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level EQV operator m = n .EQV. l !$omp atomic update @@ -187,8 +161,7 @@ program OmpAtomic !$omp atomic update m = n .NEQV. m !$omp atomic update - !ERROR: Atomic update statement should be of form `m = m operator expr` OR `m = expr operator m` - !ERROR: Exactly one occurence of 'm' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable m should occur exactly once among the arguments of the top-level NEQV/EOR operator m = n .NEQV. 
l end program OmpAtomic @@ -204,35 +177,34 @@ subroutine more_invalid_atomic_update_stmts() type(some_type) p !$omp atomic - !ERROR: Invalid or missing operator in atomic update statement x = x !$omp atomic update - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable x should appear as an argument in the update operation x = 1 !$omp atomic update - !ERROR: Exactly one occurence of 'a' expected on the RHS of atomic update assignment statement + !ERROR: Within atomic operation a and a*b access the same storage a = a * b + a !$omp atomic - !ERROR: Atomic update statement should be of form `a = a operator expr` OR `a = expr operator a` + !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level * operator a = b * (a + 9) !$omp atomic update - !ERROR: Exactly one occurence of 'a' expected on the RHS of atomic update assignment statement + !ERROR: Within atomic operation a and (a+b) access the same storage a = a * (a + b) !$omp atomic - !ERROR: Exactly one occurence of 'a' expected on the RHS of atomic update assignment statement + !ERROR: Within atomic operation a and (b+a) access the same storage a = (b + a) * a !$omp atomic - !ERROR: Atomic update statement should be of form `a = a operator expr` OR `a = expr operator a` + !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level + operator a = a * b + c !$omp atomic update - !ERROR: Atomic update statement should be of form `a = a operator expr` OR `a = expr operator a` + !ERROR: The atomic variable a should occur exactly once among the arguments of the top-level + operator a = a + b + c !$omp atomic @@ -243,23 +215,18 @@ subroutine more_invalid_atomic_update_stmts() !$omp atomic !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4) - !ERROR: Expected scalar expression on the RHS of atomic update assignment statement a = a + d !$omp atomic update !ERROR: 
No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4) - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` - !ERROR: Expected scalar expression on the RHS of atomic update assignment statement + !ERROR: The atomic variable x should occur exactly once among the arguments of the top-level / operator x = x * y / z !$omp atomic - !ERROR: Atomic update statement should be of form `p%m = p%m operator expr` OR `p%m = expr operator p%m` - !ERROR: Exactly one occurence of 'p%m' expected on the RHS of atomic update assignment statement + !ERROR: The atomic variable p%m should occur exactly once among the arguments of the top-level + operator p%m = x + y !$omp atomic update !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar REAL(4) and rank 1 array of REAL(4) - !ERROR: Expected scalar expression on the RHS of atomic update assignment statement - !ERROR: Exactly one occurence of 'p%m' expected on the RHS of atomic update assignment statement p%m = p%m + p%n end subroutine diff --git a/flang/test/Semantics/OpenMP/atomic05.f90 b/flang/test/Semantics/OpenMP/atomic05.f90 index 266268a212440..77ffc6e57f1a3 100644 --- a/flang/test/Semantics/OpenMP/atomic05.f90 +++ b/flang/test/Semantics/OpenMP/atomic05.f90 @@ -8,20 +8,20 @@ program OmpAtomic use omp_lib integer :: g, x - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct + !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct !$omp atomic relaxed, seq_cst x = x + 1 - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct + !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct !$omp atomic read seq_cst, relaxed x = g - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct + !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct !$omp atomic write 
relaxed, release x = 2 * 4 - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct + !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct !$omp atomic update release, seq_cst - !ERROR: Invalid or missing operator in atomic update statement + !ERROR: The atomic variable x should appear as an argument in the update operation x = 10 - !ERROR: More than one memory order clause not allowed on OpenMP ATOMIC construct + !ERROR: At most one clause from the 'memory-order' group is allowed on ATOMIC construct !$omp atomic capture release, seq_cst x = g g = x * 10 diff --git a/flang/test/Semantics/OpenMP/critical-hint-clause.f90 b/flang/test/Semantics/OpenMP/critical-hint-clause.f90 index 7ca8c858239f7..e9cfa49bf934e 100644 --- a/flang/test/Semantics/OpenMP/critical-hint-clause.f90 +++ b/flang/test/Semantics/OpenMP/critical-hint-clause.f90 @@ -18,7 +18,7 @@ program sample y = 2 !$omp end critical (name) - !ERROR: Hint clause value is not a valid OpenMP synchronization value + !ERROR: The synchronization hint is not valid !$omp critical (name) hint(3) y = 2 !$omp end critical (name) @@ -27,12 +27,12 @@ program sample y = 2 !$omp end critical (name) - !ERROR: Hint clause value is not a valid OpenMP synchronization value + !ERROR: The synchronization hint is not valid !$omp critical (name) hint(7) y = 2 !$omp end critical (name) - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must be a constant value !$omp critical (name) hint(x) y = 2 @@ -54,7 +54,7 @@ program sample y = 2 !$omp end critical (name) - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must be a constant value !$omp critical (name) hint(omp_sync_hint_uncontended + omp_sync_hint) y = 2 @@ -84,35 +84,35 @@ program sample y = 2 !$omp end critical (name) - !ERROR: Hint 
clause value is not a valid OpenMP synchronization value + !ERROR: The synchronization hint is not valid !$omp critical (name) hint(omp_sync_hint_uncontended + omp_sync_hint_contended) y = 2 !$omp end critical (name) - !ERROR: Hint clause value is not a valid OpenMP synchronization value + !ERROR: The synchronization hint is not valid !$omp critical (name) hint(omp_sync_hint_nonspeculative + omp_lock_hint_speculative) y = 2 !$omp end critical (name) - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must have INTEGER type, but is REAL(4) !$omp critical (name) hint(1.0) y = 2 !$omp end critical (name) - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Operands of + must be numeric; have LOGICAL(4) and INTEGER(4) !$omp critical (name) hint(z + omp_sync_hint_nonspeculative) y = 2 !$omp end critical (name) - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must be a constant value !$omp critical (name) hint(k + omp_sync_hint_speculative) y = 2 !$omp end critical (name) - !ERROR: Hint clause must have non-negative constant integer expression + !ERROR: Synchronization hint must be a constant integer value !ERROR: Must be a constant value !$omp critical (name) hint(p(1) + omp_sync_hint_uncontended) y = 2 diff --git a/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90 b/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90 index 505cbc48fef90..8fdd2aed3ec1f 100644 --- a/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90 +++ b/flang/test/Semantics/OpenMP/omp-atomic-assignment-stmt.f90 @@ -20,70 +20,64 @@ program sample !$omp atomic read !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4) - !ERROR: Expected scalar 
expression on the RHS of atomic assignment statement + !ERROR: Atomic variable y(1_8:3_8:1_8) should be a scalar v = y(1:3) !$omp atomic read - !ERROR: Expected scalar variable of intrinsic type on RHS of atomic assignment statement + !ERROR: Atomic expression x*(10_4+x) should be a variable v = x * (10 + x) !$omp atomic read - !ERROR: Expected scalar variable of intrinsic type on RHS of atomic assignment statement + !ERROR: Atomic expression 4_4 should be a variable v = 4 !$omp atomic read - !ERROR: k must not have ALLOCATABLE attribute + !ERROR: Atomic variable k cannot be ALLOCATABLE v = k !$omp atomic write - !ERROR: k must not have ALLOCATABLE attribute + !ERROR: Atomic variable k cannot be ALLOCATABLE k = x !$omp atomic update - !ERROR: k must not have ALLOCATABLE attribute + !ERROR: Atomic variable k cannot be ALLOCATABLE k = k + x * (v * x) !$omp atomic - !ERROR: k must not have ALLOCATABLE attribute + !ERROR: Atomic variable k cannot be ALLOCATABLE k = v * k !$omp atomic write - !ERROR: RHS expression on atomic assignment statement cannot access 'z%y' + !ERROR: Within atomic operation z%y and x+z%y access the same storage z%y = x + z%y !$omp atomic write - !ERROR: RHS expression on atomic assignment statement cannot access 'x' + !ERROR: Within atomic operation x and x access the same storage x = x !$omp atomic write - !ERROR: RHS expression on atomic assignment statement cannot access 'm' + !ERROR: Within atomic operation m and min(m,x,z%m)+k access the same storage m = min(m, x, z%m) + k !$omp atomic read - !ERROR: RHS expression on atomic assignment statement cannot access 'x' + !ERROR: Within atomic operation x and x access the same storage x = x !$omp atomic read - !ERROR: Expected scalar variable of intrinsic type on RHS of atomic assignment statement - !ERROR: RHS expression on atomic assignment statement cannot access 'm' + !ERROR: Atomic expression min(m,x,z%m)+k should be a variable m = min(m, x, z%m) + k !$omp atomic read !ERROR: No intrinsic or 
user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4) - !ERROR: Expected scalar expression on the RHS of atomic assignment statement + !ERROR: Atomic variable a should be a scalar x = a - !$omp atomic read - !ERROR: Expected scalar variable on the LHS of atomic assignment statement - a = x - !$omp atomic write !ERROR: No intrinsic or user-defined ASSIGNMENT(=) matches scalar INTEGER(4) and rank 1 array of INTEGER(4) - !ERROR: Expected scalar expression on the RHS of atomic assignment statement x = a !$omp atomic write - !ERROR: Expected scalar variable on the LHS of atomic assignment statement + !ERROR: Atomic variable a should be a scalar a = x !$omp atomic capture @@ -93,7 +87,7 @@ program sample !$omp atomic release capture v = x - !ERROR: Atomic update statement should be of form `x = x operator expr` OR `x = expr operator x` + ! This ends up being "x = b + x". x = b + (x*1) !$omp end atomic @@ -103,60 +97,58 @@ program sample !$omp end atomic !$omp atomic capture - !ERROR: Captured variable/array element/derived-type component x expected to be assigned in the second statement of ATOMIC CAPTURE construct + !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read b v = x b = b + 1 !$omp end atomic !$omp atomic capture - !ERROR: Captured variable/array element/derived-type component x expected to be assigned in the second statement of ATOMIC CAPTURE construct + !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read b v = x b = 10 !$omp end atomic !$omp atomic capture - !ERROR: Updated variable/array element/derived-type component x expected to be captured in the second statement of ATOMIC CAPTURE construct x = x + 10 + !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read x v = b !$omp end atomic + !ERROR: In ATOMIC UPDATE operation with CAPTURE neither statement could be the update 
or the capture !$omp atomic capture - !ERROR: Invalid ATOMIC CAPTURE construct statements. Expected one of [update-stmt, capture-stmt], [capture-stmt, update-stmt], or [capture-stmt, write-stmt] v = 1 x = 4 !$omp end atomic !$omp atomic capture - !ERROR: Captured variable/array element/derived-type component z%y expected to be assigned in the second statement of ATOMIC CAPTURE construct + !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read z%m x = z%y z%m = z%m + 1.0 !$omp end atomic !$omp atomic capture - !ERROR: Updated variable/array element/derived-type component z%m expected to be captured in the second statement of ATOMIC CAPTURE construct z%m = z%m + 1.0 + !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read z%m x = z%y !$omp end atomic !$omp atomic capture - !ERROR: Captured variable/array element/derived-type component y(2) expected to be assigned in the second statement of ATOMIC CAPTURE construct + !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read y(1_8) x = y(2) y(1) = y(1) + 1 !$omp end atomic !$omp atomic capture - !ERROR: Updated variable/array element/derived-type component y(1) expected to be captured in the second statement of ATOMIC CAPTURE construct y(1) = y(1) + 1 + !ERROR: In ATOMIC UPDATE operation with CAPTURE the right-hand side of the capture assignment should read y(1_8) x = y(2) !$omp end atomic !$omp atomic read - !ERROR: Expected scalar variable on the LHS of atomic assignment statement - !ERROR: Expected scalar expression on the RHS of atomic assignment statement + !ERROR: Atomic variable r cannot have CHARACTER type l = r !$omp atomic write - !ERROR: Expected scalar variable on the LHS of atomic assignment statement - !ERROR: Expected scalar expression on the RHS of atomic assignment statement + !ERROR: Atomic variable l cannot have CHARACTER type l = r end program diff 
--git a/flang/test/Semantics/OpenMP/requires-atomic01.f90 b/flang/test/Semantics/OpenMP/requires-atomic01.f90 index ae9fd086015dd..e8817c3f5ef61 100644 --- a/flang/test/Semantics/OpenMP/requires-atomic01.f90 +++ b/flang/test/Semantics/OpenMP/requires-atomic01.f90 @@ -10,20 +10,23 @@ program requires ! READ ! ---------------------------------------------------------------------------- - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead - ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Read + ! CHECK: OmpClause -> SeqCst !$omp atomic read i = j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> SeqCst + ! CHECK: OmpClause -> Relaxed + ! CHECK: OmpClause -> Read !$omp atomic relaxed read i = j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Read + ! CHECK-NOT: OmpClause -> SeqCst + ! CHECK: OmpClause -> Relaxed !$omp atomic read relaxed i = j @@ -31,20 +34,23 @@ program requires ! WRITE ! ---------------------------------------------------------------------------- - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite - ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Write + ! CHECK: OmpClause -> SeqCst !$omp atomic write i = j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> SeqCst + ! CHECK: OmpClause -> Relaxed + ! 
CHECK: OmpClause -> Write !$omp atomic relaxed write i = j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Write + ! CHECK-NOT: OmpClause -> SeqCst + ! CHECK: OmpClause -> Relaxed !$omp atomic write relaxed i = j @@ -52,31 +58,34 @@ program requires ! UPDATE ! ---------------------------------------------------------------------------- - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate - ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Update + ! CHECK: OmpClause -> SeqCst !$omp atomic update i = i + j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> SeqCst + ! CHECK: OmpClause -> Relaxed + ! CHECK: OmpClause -> Update !$omp atomic relaxed update i = i + j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Update + ! CHECK-NOT: OmpClause -> SeqCst + ! CHECK: OmpClause -> Relaxed !$omp atomic update relaxed i = i + j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomic - ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> SeqCst !$omp atomic i = i + j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomic - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> SeqCst + ! 
CHECK: OmpClause -> Relaxed !$omp atomic relaxed i = i + j @@ -84,24 +93,27 @@ program requires ! CAPTURE ! ---------------------------------------------------------------------------- - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture - ! CHECK: OmpMemoryOrderClause -> OmpClause -> SeqCst + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Capture + ! CHECK: OmpClause -> SeqCst !$omp atomic capture i = j j = j + 1 !$omp end atomic - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> SeqCst + ! CHECK: OmpClause -> Relaxed + ! CHECK: OmpClause -> Capture !$omp atomic relaxed capture i = j j = j + 1 !$omp end atomic - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> SeqCst - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Capture + ! CHECK-NOT: OmpClause -> SeqCst + ! CHECK: OmpClause -> Relaxed !$omp atomic capture relaxed i = j j = j + 1 diff --git a/flang/test/Semantics/OpenMP/requires-atomic02.f90 b/flang/test/Semantics/OpenMP/requires-atomic02.f90 index 4976a9667eb78..a3724a83456fd 100644 --- a/flang/test/Semantics/OpenMP/requires-atomic02.f90 +++ b/flang/test/Semantics/OpenMP/requires-atomic02.f90 @@ -10,20 +10,23 @@ program requires ! READ ! ---------------------------------------------------------------------------- - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Acquire + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Read + ! CHECK: OmpClause -> AcqRel !$omp atomic read i = j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Acquire - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! 
CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> AcqRel + ! CHECK: OmpClause -> Relaxed + ! CHECK: OmpClause -> Read !$omp atomic relaxed read i = j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicRead - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Acquire - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Read + ! CHECK-NOT: OmpClause -> AcqRel + ! CHECK: OmpClause -> Relaxed !$omp atomic read relaxed i = j @@ -31,20 +34,23 @@ program requires ! WRITE ! ---------------------------------------------------------------------------- - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Release + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Write + ! CHECK: OmpClause -> AcqRel !$omp atomic write i = j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> AcqRel + ! CHECK: OmpClause -> Relaxed + ! CHECK: OmpClause -> Write !$omp atomic relaxed write i = j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicWrite - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Write + ! CHECK-NOT: OmpClause -> AcqRel + ! CHECK: OmpClause -> Relaxed !$omp atomic write relaxed i = j @@ -52,31 +58,34 @@ program requires ! UPDATE ! ---------------------------------------------------------------------------- - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Release + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Update + ! CHECK: OmpClause -> AcqRel !$omp atomic update i = i + j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate - ! 
CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> AcqRel + ! CHECK: OmpClause -> Relaxed + ! CHECK: OmpClause -> Update !$omp atomic relaxed update i = i + j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicUpdate - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Update + ! CHECK-NOT: OmpClause -> AcqRel + ! CHECK: OmpClause -> Relaxed !$omp atomic update relaxed i = i + j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomic - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Release + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> AcqRel !$omp atomic i = i + j - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomic - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> Release - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> AcqRel + ! CHECK: OmpClause -> Relaxed !$omp atomic relaxed i = i + j @@ -84,24 +93,27 @@ program requires ! CAPTURE ! ---------------------------------------------------------------------------- - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture - ! CHECK: OmpMemoryOrderClause -> OmpClause -> AcqRel + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK: OmpClause -> Capture + ! CHECK: OmpClause -> AcqRel !$omp atomic capture i = j j = j + 1 !$omp end atomic - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture - ! CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> AcqRel - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> AcqRel + ! CHECK: OmpClause -> Relaxed + ! CHECK: OmpClause -> Capture !$omp atomic relaxed capture i = j j = j + 1 !$omp end atomic - ! CHECK-LABEL: OpenMPAtomicConstruct -> OmpAtomicCapture - ! 
CHECK-NOT: OmpMemoryOrderClause -> OmpClause -> AcqRel - ! CHECK: OmpMemoryOrderClause -> OmpClause -> Relaxed + ! CHECK-LABEL: OpenMPAtomicConstruct + ! CHECK-NOT: OmpClause -> AcqRel + ! CHECK: OmpClause -> Capture + ! CHECK: OmpClause -> Relaxed !$omp atomic capture relaxed i = j j = j + 1 From e64f8e043cdfc394fd31e157c8c5fb25ca85bd2f Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Wed, 11 Jun 2025 10:17:54 -0500 Subject: [PATCH 092/851] [flang][Driver] Guard check for pic/pie settings without driver flags (#143530) The default relocation model for clang depends on the cmake flag CLANG_DEFAULT_PIE_ON_LINUX. By default it is set to ON, but when it's OFF, the default relocation model will be "static". The outcome of the test running clang without any PIC/PIE flags will depend on the cmake flag, so make sure it only runs when the flag is ON. --- flang/test/Driver/pic-flags.f90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/test/Driver/pic-flags.f90 b/flang/test/Driver/pic-flags.f90 index cb62d353cc18c..5a06163c485cd 100644 --- a/flang/test/Driver/pic-flags.f90 +++ b/flang/test/Driver/pic-flags.f90 @@ -1,6 +1,6 @@ ! RUN: %if aarch64-registered-target %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu -fno-pie 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-STATIC,CHECK-STATIC-IR %} -! RUN: %if aarch64-registered-target %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PIE-LEVEL2,CHECK-PIE-LEVEL2-IR %} +! RUN: %if aarch64-registered-target && clang_default_pie_on_linux %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PIE-LEVEL2,CHECK-PIE-LEVEL2-IR %} ! RUN: %if aarch64-registered-target %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu -fpie 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PIE-LEVEL1,CHECK-PIE-LEVEL1-IR %} ! 
RUN: %if aarch64-registered-target %{ %flang -v -S -emit-llvm -o - %s --target=aarch64-linux-gnu -fPIE 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-PIE-LEVEL2,CHECK-PIE-LEVEL2-IR %} From adfea33f0c412b8475b755a8d82c9961b785eb02 Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Wed, 11 Jun 2025 11:28:48 -0400 Subject: [PATCH 093/851] [PowerPC][AIX] xfail atan-intrinsic to unblock bot (#143723) Testcase from https://github.com/llvm/llvm-project/pull/143416 is causing the AIX bot to be red. XFAIL for now till issue can be resolved. --- llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll index d824d6d35643d..c5c17d65524c2 100644 --- a/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll +++ b/llvm/test/Transforms/InstSimplify/ConstProp/atan-intrinsic.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 ; RUN: opt -S -passes=instsimplify < %s | FileCheck %s +; XFAIL: target={{.*}}-aix{{.*}} define double @test_atan_0() { ; CHECK-LABEL: define double @test_atan_0() { From bc9f4edf47d2cbed3b1ba7a61d1497dded91ed22 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 11 Jun 2025 16:44:09 +0100 Subject: [PATCH 094/851] [LTO] Fix used before intialised warning (#143705) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For whatever reason I can't reproduce this locally but I can on Compiler Explorer (https://godbolt.org/z/nfv4b83q6) and on our flang gcc bot (https://lab.llvm.org/buildbot/#/builders/130/builds/13683/steps/5/logs/stdio). 
In file included from ../llvm-project/llvm/include/llvm/LTO/LTO.h:33, from ../llvm-project/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp:29: ../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h: In constructor ‘llvm::FunctionImporter::ImportListsTy::ImportListsTy()’: ../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h:275:33: warning: member ‘llvm::FunctionImporter::ImportListsTy::ImportIDs’ is used uninitialized [-Wuninitialized] 275 | ImportListsTy() : EmptyList(ImportIDs) {} | ^~~~~~~~~ ../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h: In constructor ‘llvm::FunctionImporter::ImportListsTy::ImportListsTy(size_t)’: ../llvm-project/llvm/include/llvm/Transforms/IPO/FunctionImport.h:276:44: warning: member ‘llvm::FunctionImporter::ImportListsTy::ImportIDs’ is used uninitialized [-Wuninitialized] 276 | ImportListsTy(size_t Size) : EmptyList(ImportIDs), ListsImpl(Size) {} | ^~~~~~~~~ ImportIDs was being used during construction of EmptyList, before ImportIDs itself had been constructed. --- llvm/include/llvm/Transforms/IPO/FunctionImport.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/FunctionImport.h b/llvm/include/llvm/Transforms/IPO/FunctionImport.h index 65228bb65ba8b..e6ae9ee831d50 100644 --- a/llvm/include/llvm/Transforms/IPO/FunctionImport.h +++ b/llvm/include/llvm/Transforms/IPO/FunctionImport.h @@ -272,8 +272,9 @@ class FunctionImporter { // A map from destination modules to lists of imports. 
class ImportListsTy { public: - ImportListsTy() : EmptyList(ImportIDs) {} - ImportListsTy(size_t Size) : EmptyList(ImportIDs), ListsImpl(Size) {} + ImportListsTy() : ImportIDs(), EmptyList(ImportIDs) {} + ImportListsTy(size_t Size) + : ImportIDs(), EmptyList(ImportIDs), ListsImpl(Size) {} ImportMapTy &operator[](StringRef DestMod) { return ListsImpl.try_emplace(DestMod, ImportIDs).first->second; @@ -293,9 +294,9 @@ class FunctionImporter { const_iterator end() const { return ListsImpl.end(); } private: + ImportIDTable ImportIDs; ImportMapTy EmptyList; DenseMap ListsImpl; - ImportIDTable ImportIDs; }; /// The set contains an entry for every global value that the module exports. From 91be47dccfa3480c152916838404d49107fde45c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 08:53:54 -0700 Subject: [PATCH 095/851] [flang] Fix warnings This patch fixes: flang/lib/Lower/OpenMP/OpenMP.cpp:3904:9: error: unused variable 'action0' [-Werror,-Wunused-variable] flang/lib/Lower/OpenMP/OpenMP.cpp:3905:9: error: unused variable 'action1' [-Werror,-Wunused-variable] --- flang/lib/Lower/OpenMP/OpenMP.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp index 3f3b85696db31..c13fa471978db 100644 --- a/flang/lib/Lower/OpenMP/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP/OpenMP.cpp @@ -3911,6 +3911,8 @@ static void genOMP(lower::AbstractConverter &converter, lower::SymMap &symTable, // Capturing operation. assert(action0 != analysis.None && action1 != analysis.None && "Expexcing two actions"); + (void)action0; + (void)action1; captureOp = builder.create(loc, hint, memOrder); // Set the non-atomic insertion point to before the atomic.capture. From 2ab83e9f68f0c7b1a7199455d7ce05430d93fa44 Mon Sep 17 00:00:00 2001 From: Tony Varghese Date: Wed, 11 Jun 2025 21:28:26 +0530 Subject: [PATCH 096/851] [NFC][PowerPC] Rename xxevalPattern to adhere to naming convention. 
(#143675) Rename class `xxevalPattern` to adhere to naming convention listed in the coding guideline and used for all other classes in the td file. --- llvm/lib/Target/PowerPC/PPCInstrP10.td | 62 +++++++++++++------------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrP10.td b/llvm/lib/Target/PowerPC/PPCInstrP10.td index a7f758745efe2..d295f35fb1dd0 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrP10.td +++ b/llvm/lib/Target/PowerPC/PPCInstrP10.td @@ -2159,7 +2159,7 @@ let AddedComplexity = 400, Predicates = [IsISA3_1, HasVSX] in { (COPY_TO_REGCLASS $VRB, VSRC), 2)))>; } -class xxevalPattern imm> : +class XXEvalPattern imm> : Pat<(v4i32 pattern), (XXEVAL $vA, $vB, $vC, imm)> {} let Predicates = [PrefixInstrs, HasP10Vector] in { @@ -2192,83 +2192,83 @@ let Predicates = [PrefixInstrs, HasP10Vector] in { // Anonymous patterns for XXEVAL // AND // and(A, B, C) - def : xxevalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>; + def : XXEvalPattern<(and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 1>; // and(A, xor(B, C)) - def : xxevalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>; + def : XXEvalPattern<(and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 6>; // and(A, or(B, C)) - def : xxevalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>; + def : XXEvalPattern<(and v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 7>; // and(A, nor(B, C)) - def : xxevalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>; + def : XXEvalPattern<(and v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 8>; // and(A, eqv(B, C)) - def : xxevalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>; + def : XXEvalPattern<(and v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 9>; // and(A, nand(B, C)) - def : xxevalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>; + def : XXEvalPattern<(and v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 14>; // NAND // nand(A, B, C) - def : xxevalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, 
v4i32:$vC))), + def : XXEvalPattern<(vnot (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), !sub(255, 1)>; // nand(A, xor(B, C)) - def : xxevalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), + def : XXEvalPattern<(vnot (and v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), !sub(255, 6)>; // nand(A, or(B, C)) - def : xxevalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), + def : XXEvalPattern<(vnot (and v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), !sub(255, 7)>; // nand(A, nor(B, C)) - def : xxevalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<(or (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), !sub(255, 8)>; // nand(A, eqv(B, C)) - def : xxevalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<(or (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), !sub(255, 9)>; // nand(A, nand(B, C)) - def : xxevalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<(or (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), !sub(255, 14)>; // EQV // (eqv A, B, C) - def : xxevalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), + def : XXEvalPattern<(or (and v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), (vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)))), 150>; // (eqv A, (and B, C)) - def : xxevalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>; + def : XXEvalPattern<(vnot (xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 225>; // (eqv A, (or B, C)) - def : xxevalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>; + def : XXEvalPattern<(vnot (xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 135>; // NOR // (nor A, B, C) - def : xxevalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>; + def : XXEvalPattern<(vnot (or v4i32:$vA, (or v4i32:$vB, v4i32:$vC))), 128>; // (nor A, (and B, C)) - def : xxevalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>; + def : XXEvalPattern<(vnot (or v4i32:$vA, (and v4i32:$vB, v4i32:$vC))), 224>; // (nor A, (eqv B, C)) - def : 
xxevalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>; + def : XXEvalPattern<(and (vnot v4i32:$vA), (xor v4i32:$vB, v4i32:$vC)), 96>; // (nor A, (nand B, C)) - def : xxevalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>; + def : XXEvalPattern<(and (vnot v4i32:$vA), (and v4i32:$vB, v4i32:$vC)), 16>; // (nor A, (nor B, C)) - def : xxevalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>; + def : XXEvalPattern<(and (vnot v4i32:$vA), (or v4i32:$vB, v4i32:$vC)), 112>; // (nor A, (xor B, C)) - def : xxevalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>; + def : XXEvalPattern<(vnot (or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC))), 144>; // OR // (or A, B, C) - def : xxevalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>; + def : XXEvalPattern<(or v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 127>; // (or A, (and B, C)) - def : xxevalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>; + def : XXEvalPattern<(or v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 31>; // (or A, (eqv B, C)) - def : xxevalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>; + def : XXEvalPattern<(or v4i32:$vA, (vnot (xor v4i32:$vB, v4i32:$vC))), 159>; // (or A, (nand B, C)) - def : xxevalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>; + def : XXEvalPattern<(or v4i32:$vA, (vnot (and v4i32:$vB, v4i32:$vC))), 239>; // (or A, (nor B, C)) - def : xxevalPattern<(or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>; + def : XXEvalPattern<(or v4i32:$vA, (vnot (or v4i32:$vB, v4i32:$vC))), 143>; // (or A, (xor B, C)) - def : xxevalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>; + def : XXEvalPattern<(or v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 111>; // XOR // (xor A, B, C) - def : xxevalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>; + def : XXEvalPattern<(xor v4i32:$vA, (xor v4i32:$vB, v4i32:$vC)), 105>; // (xor A, (and B, C)) - def : xxevalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>; + def : 
XXEvalPattern<(xor v4i32:$vA, (and v4i32:$vB, v4i32:$vC)), 30>; // (xor A, (or B, C)) - def : xxevalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; + def : XXEvalPattern<(xor v4i32:$vA, (or v4i32:$vB, v4i32:$vC)), 120>; // Anonymous patterns to select prefixed VSX loads and stores. // Load / Store f128 From 38fb0117ab10c4541e58697a4b56de2a646cf3f4 Mon Sep 17 00:00:00 2001 From: Peng Liu Date: Wed, 11 Jun 2025 12:13:36 -0400 Subject: [PATCH 097/851] [libc++] Make forward_list constexpr as part of P3372R3 (#129435) Fixes #128658 --- libcxx/docs/FeatureTestMacroTable.rst | 2 + libcxx/include/__memory/allocation_guard.h | 20 +- libcxx/include/__memory/pointer_traits.h | 16 +- libcxx/include/forward_list | 469 ++++++++++-------- libcxx/include/version | 2 + .../forwardlist/compare.three_way.pass.cpp | 7 +- .../sequences/forwardlist/empty.pass.cpp | 13 +- .../forwardlist.access/front.pass.cpp | 16 +- .../forwardlist.cons/alloc.compile.fail.cpp | 13 +- .../forwardlist.cons/alloc.pass.cpp | 13 +- .../forwardlist.cons/assign_copy.pass.cpp | 13 +- .../forwardlist.cons/assign_init.pass.cpp | 13 +- .../forwardlist.cons/assign_move.pass.cpp | 13 +- .../forwardlist.cons/assign_op_init.pass.cpp | 13 +- .../forwardlist.cons/assign_range.pass.cpp | 13 +- .../assign_size_value.pass.cpp | 13 +- .../forwardlist.cons/copy.pass.cpp | 13 +- .../forwardlist.cons/copy_alloc.pass.cpp | 13 +- .../forwardlist.cons/default.pass.cpp | 13 +- .../forwardlist.cons/from_range.pass.cpp | 19 +- .../forwardlist.cons/init.pass.cpp | 13 +- .../forwardlist.cons/init_alloc.pass.cpp | 13 +- .../forwardlist.cons/move.pass.cpp | 13 +- .../forwardlist.cons/move_alloc.pass.cpp | 13 +- .../forwardlist.cons/range.pass.cpp | 13 +- .../forwardlist.cons/range_alloc.pass.cpp | 13 +- .../forwardlist.cons/size.pass.cpp | 4 +- .../forwardlist.cons/size_value.pass.cpp | 13 +- .../size_value_alloc.pass.cpp | 13 +- .../forwardlist.erasure/erase.pass.cpp | 18 +- .../forwardlist.erasure/erase_if.pass.cpp | 18 +- 
.../forwardlist.iter/before_begin.pass.cpp | 17 +- .../forwardlist.iter/iterators.pass.cpp | 27 +- .../assign_range.pass.cpp | 19 +- .../forwardlist.modifiers/clear.pass.cpp | 13 +- .../emplace_after.pass.cpp | 13 +- .../emplace_front.pass.cpp | 13 +- .../erase_after_many.pass.cpp | 13 +- .../erase_after_one.pass.cpp | 13 +- .../insert_after_const.pass.cpp | 13 +- .../insert_after_init.pass.cpp | 13 +- .../insert_after_range.pass.cpp | 13 +- .../insert_after_rv.pass.cpp | 13 +- .../insert_after_size_value.pass.cpp | 13 +- .../insert_range_after.pass.cpp | 23 +- .../forwardlist.modifiers/pop_front.pass.cpp | 13 +- .../prepend_range.pass.cpp | 19 +- .../push_front_const.pass.cpp | 13 +- .../push_front_exception_safety.pass.cpp | 2 +- .../push_front_rv.pass.cpp | 13 +- .../resize_size.pass.cpp | 17 +- .../resize_size_value.pass.cpp | 15 +- .../forwardlist.ops/merge_lvalue.pass.cpp | 17 +- .../merge_lvalue_pred.pass.cpp | 17 +- .../forwardlist.ops/merge_rvalue.pass.cpp | 17 +- .../merge_rvalue_pred.pass.cpp | 17 +- .../forwardlist.ops/remove.pass.cpp | 27 +- .../forwardlist.ops/remove_if.pass.cpp | 25 +- .../forwardlist.ops/reverse.pass.cpp | 19 +- .../splice_after_flist.pass.cpp | 23 +- .../forwardlist.ops/splice_after_one.pass.cpp | 25 +- .../splice_after_range.pass.cpp | 27 +- .../forwardlist.ops/unique.pass.cpp | 15 +- .../forwardlist.ops/unique_pred.pass.cpp | 25 +- .../forwardlist.spec/equal.pass.cpp | 17 +- .../forwardlist.spec/member_swap.pass.cpp | 13 +- .../forwardlist.spec/non_member_swap.pass.cpp | 13 +- .../forwardlist.spec/relational.pass.cpp | 21 +- .../swap_noexcept.compile.pass.cpp | 4 +- .../forwardlist/get_allocator.pass.cpp | 13 +- .../sequences/forwardlist/incomplete.pass.cpp | 17 +- .../sequences/forwardlist/max_size.pass.cpp | 13 +- .../forward_list.version.compile.pass.cpp | 27 + .../version.version.compile.pass.cpp | 27 + libcxx/test/support/counting_predicates.h | 62 +-- .../generate_feature_test_macro_components.py | 5 + 76 files changed, 
1186 insertions(+), 459 deletions(-) mode change 100755 => 100644 libcxx/utils/generate_feature_test_macro_components.py diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index a89d4038785cd..3e6fd643f620c 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -420,6 +420,8 @@ Status ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_algorithms`` ``202306L`` ---------------------------------------------------------- ----------------- + ``__cpp_lib_constexpr_forward_list`` ``202502L`` + ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_new`` ``202406L`` ---------------------------------------------------------- ----------------- ``__cpp_lib_constexpr_queue`` ``202502L`` diff --git a/libcxx/include/__memory/allocation_guard.h b/libcxx/include/__memory/allocation_guard.h index 66edcd92ed618..016e1a3a429b8 100644 --- a/libcxx/include/__memory/allocation_guard.h +++ b/libcxx/include/__memory/allocation_guard.h @@ -49,24 +49,26 @@ struct __allocation_guard { using _Size _LIBCPP_NODEBUG = typename allocator_traits<_Alloc>::size_type; template // we perform the allocator conversion inside the constructor - _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __allocation_guard(_AllocT __alloc, _Size __n) : __alloc_(std::move(__alloc)), __n_(__n), __ptr_(allocator_traits<_Alloc>::allocate(__alloc_, __n_)) // initialization order is important {} - _LIBCPP_HIDE_FROM_ABI ~__allocation_guard() _NOEXCEPT { __destroy(); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__allocation_guard() _NOEXCEPT { __destroy(); } - _LIBCPP_HIDE_FROM_ABI __allocation_guard(const __allocation_guard&) = delete; - _LIBCPP_HIDE_FROM_ABI __allocation_guard(__allocation_guard&& __other) _NOEXCEPT + __allocation_guard(const 
__allocation_guard&) = delete; + __allocation_guard& operator=(const __allocation_guard& __other) = delete; + + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __allocation_guard(__allocation_guard&& __other) _NOEXCEPT : __alloc_(std::move(__other.__alloc_)), __n_(__other.__n_), __ptr_(__other.__ptr_) { __other.__ptr_ = nullptr; } - _LIBCPP_HIDE_FROM_ABI __allocation_guard& operator=(const __allocation_guard& __other) = delete; - _LIBCPP_HIDE_FROM_ABI __allocation_guard& operator=(__allocation_guard&& __other) _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __allocation_guard& + operator=(__allocation_guard&& __other) _NOEXCEPT { if (std::addressof(__other) != this) { __destroy(); @@ -79,17 +81,17 @@ struct __allocation_guard { return *this; } - _LIBCPP_HIDE_FROM_ABI _Pointer + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Pointer __release_ptr() _NOEXCEPT { // not called __release() because it's a keyword in objective-c++ _Pointer __tmp = __ptr_; __ptr_ = nullptr; return __tmp; } - _LIBCPP_HIDE_FROM_ABI _Pointer __get() const _NOEXCEPT { return __ptr_; } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Pointer __get() const _NOEXCEPT { return __ptr_; } private: - _LIBCPP_HIDE_FROM_ABI void __destroy() _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __destroy() _NOEXCEPT { if (__ptr_ != nullptr) { allocator_traits<_Alloc>::deallocate(__alloc_, __ptr_, __n_); } diff --git a/libcxx/include/__memory/pointer_traits.h b/libcxx/include/__memory/pointer_traits.h index 4ba50898fb37d..879b387b9ad1f 100644 --- a/libcxx/include/__memory/pointer_traits.h +++ b/libcxx/include/__memory/pointer_traits.h @@ -245,8 +245,8 @@ inline _LIBCPP_HIDE_FROM_ABI constexpr auto to_address(_Tp* __p) noexcept { } template -inline _LIBCPP_HIDE_FROM_ABI constexpr auto -to_address(const _Pointer& __p) noexcept -> decltype(std::__to_address(__p)) { +inline _LIBCPP_HIDE_FROM_ABI constexpr auto to_address(const _Pointer& __p) noexcept + -> 
decltype(std::__to_address(__p)) { return std::__to_address(__p); } #endif @@ -302,6 +302,18 @@ concept __resettable_smart_pointer_with_args = requires(_Smart __s, _Pointer __p #endif +// This function ensures safe conversions between fancy pointers at compile-time, where we avoid casts from/to +// `__void_pointer` by obtaining the underlying raw pointer from the fancy pointer using `std::to_address`, +// then dereferencing it to retrieve the pointed-to object, and finally constructing the target fancy pointer +// to that object using the `std::pointer_traits<>::pinter_to` function. +template +_LIBCPP_CONSTEXPR_SINCE_CXX20 _LIBCPP_HIDE_FROM_ABI _PtrTo __static_fancy_pointer_cast(const _PtrFrom& __p) { + using __ptr_traits = pointer_traits<_PtrTo>; + using __element_type = typename __ptr_traits::element_type; + return __p ? __ptr_traits::pointer_to(*static_cast<__element_type*>(std::addressof(*__p))) + : static_cast<_PtrTo>(nullptr); +} + _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS diff --git a/libcxx/include/forward_list b/libcxx/include/forward_list index 5046de27a9da1..e9b2c860b89c4 100644 --- a/libcxx/include/forward_list +++ b/libcxx/include/forward_list @@ -295,8 +295,8 @@ struct __forward_node_traits { "the _LIBCPP_ABI_FORWARD_LIST_REMOVE_NODE_POINTER_UB macro to silence this diagnostic."); # endif - _LIBCPP_HIDE_FROM_ABI static __begin_node_pointer __as_iter_node(__node_pointer __p) { - return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__p)); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI static __begin_node_pointer __as_iter_node(__node_pointer __p) { + return std::__static_fancy_pointer_cast<__begin_node_pointer>(__p); } }; @@ -307,11 +307,11 @@ struct __forward_begin_node { pointer __next_; - _LIBCPP_HIDE_FROM_ABI __forward_begin_node() : __next_(nullptr) {} - _LIBCPP_HIDE_FROM_ABI explicit __forward_begin_node(pointer __n) : __next_(__n) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_begin_node() : 
__next_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_begin_node(pointer __n) : __next_(__n) {} - _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __next_as_begin() const { - return static_cast<__begin_node_pointer>(__next_); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __next_as_begin() const { + return std::__static_fancy_pointer_cast<__begin_node_pointer>(__next_); } }; @@ -335,7 +335,7 @@ private: }; public: - _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return __value_; } # else private: @@ -345,8 +345,8 @@ public: _LIBCPP_HIDE_FROM_ABI _Tp& __get_value() { return *std::__launder(reinterpret_cast<_Tp*>(&__buffer_)); } # endif - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_node(_NodePtr __next) : _Base(__next) {} - _LIBCPP_HIDE_FROM_ABI ~__forward_list_node() {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_node(_NodePtr __next) : _Base(__next) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__forward_list_node() {} }; template > @@ -357,24 +357,26 @@ class __forward_list_const_iterator; template class __forward_list_iterator { typedef __forward_node_traits<_NodePtr> __traits; + typedef typename __traits::__node_type __node_type; + typedef typename __traits::__begin_node __begin_node_type; typedef typename __traits::__node_pointer __node_pointer; typedef typename __traits::__begin_node_pointer __begin_node_pointer; typedef typename __traits::__void_pointer __void_pointer; __begin_node_pointer __ptr_; - _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { - return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__ptr_)); - } - _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const { - return static_cast<__node_pointer>(static_cast<__void_pointer>(__ptr_)); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI 
__begin_node_pointer __get_begin() const { return __ptr_; } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const { + return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_); } - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(nullptr_t) _NOEXCEPT : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(nullptr_t) _NOEXCEPT + : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__begin_node_pointer __p) _NOEXCEPT : __ptr_(__p) {} - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__node_pointer __p) _NOEXCEPT + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_iterator(__node_pointer __p) _NOEXCEPT : __ptr_(__traits::__as_iter_node(__p)) {} template @@ -389,27 +391,31 @@ public: typedef typename pointer_traits<__node_pointer>::difference_type difference_type; typedef __rebind_pointer_t<__node_pointer, value_type> pointer; - _LIBCPP_HIDE_FROM_ABI __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator() _NOEXCEPT : __ptr_(nullptr) {} - _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __get_unsafe_node_pointer()->__get_value(); } - _LIBCPP_HIDE_FROM_ABI pointer operator->() const { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const { + return __get_unsafe_node_pointer()->__get_value(); + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return pointer_traits::pointer_to(__get_unsafe_node_pointer()->__get_value()); } - _LIBCPP_HIDE_FROM_ABI __forward_list_iterator& operator++() { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator& operator++() { __ptr_ = __traits::__as_iter_node(__ptr_->__next_); return *this; } - _LIBCPP_HIDE_FROM_ABI __forward_list_iterator operator++(int) { + 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_iterator operator++(int) { __forward_list_iterator __t(*this); ++(*this); return __t; } - friend _LIBCPP_HIDE_FROM_ABI bool operator==(const __forward_list_iterator& __x, const __forward_list_iterator& __y) { + friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool + operator==(const __forward_list_iterator& __x, const __forward_list_iterator& __y) { return __x.__ptr_ == __y.__ptr_; } - friend _LIBCPP_HIDE_FROM_ABI bool operator!=(const __forward_list_iterator& __x, const __forward_list_iterator& __y) { + friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool + operator!=(const __forward_list_iterator& __x, const __forward_list_iterator& __y) { return !(__x == __y); } }; @@ -421,23 +427,25 @@ class __forward_list_const_iterator { typedef __forward_node_traits<_NodePtr> __traits; typedef typename __traits::__node_type __node_type; + typedef typename __traits::__begin_node __begin_node_type; typedef typename __traits::__node_pointer __node_pointer; typedef typename __traits::__begin_node_pointer __begin_node_pointer; typedef typename __traits::__void_pointer __void_pointer; __begin_node_pointer __ptr_; - _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { - return static_cast<__begin_node_pointer>(static_cast<__void_pointer>(__ptr_)); - } - _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const { - return static_cast<__node_pointer>(static_cast<__void_pointer>(__ptr_)); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __get_begin() const { return __ptr_; } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer __get_unsafe_node_pointer() const { + return std::__static_fancy_pointer_cast<__node_pointer>(__ptr_); } - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(nullptr_t) _NOEXCEPT : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit 
__forward_list_const_iterator(nullptr_t) _NOEXCEPT + : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(__begin_node_pointer __p) _NOEXCEPT : __ptr_(__p) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_const_iterator(__node_pointer __p) _NOEXCEPT : __ptr_(__traits::__as_iter_node(__p)) {} @@ -451,30 +459,32 @@ public: typedef typename pointer_traits<__node_pointer>::difference_type difference_type; typedef __rebind_pointer_t<__node_pointer, const value_type> pointer; - _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {} - _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator(__forward_list_iterator<__node_pointer> __p) _NOEXCEPT - : __ptr_(__p.__ptr_) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator() _NOEXCEPT : __ptr_(nullptr) {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + __forward_list_const_iterator(__forward_list_iterator<__node_pointer> __p) _NOEXCEPT : __ptr_(__p.__ptr_) {} - _LIBCPP_HIDE_FROM_ABI reference operator*() const { return __get_unsafe_node_pointer()->__get_value(); } - _LIBCPP_HIDE_FROM_ABI pointer operator->() const { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference operator*() const { + return __get_unsafe_node_pointer()->__get_value(); + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI pointer operator->() const { return pointer_traits::pointer_to(__get_unsafe_node_pointer()->__get_value()); } - _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator& operator++() { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator& operator++() { __ptr_ = __traits::__as_iter_node(__ptr_->__next_); return *this; } - _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator operator++(int) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_const_iterator operator++(int) { __forward_list_const_iterator __t(*this); 
++(*this); return __t; } - friend _LIBCPP_HIDE_FROM_ABI bool + friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool operator==(const __forward_list_const_iterator& __x, const __forward_list_const_iterator& __y) { return __x.__ptr_ == __y.__ptr_; } - friend _LIBCPP_HIDE_FROM_ABI bool + friend _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool operator!=(const __forward_list_const_iterator& __x, const __forward_list_const_iterator& __y) { return !(__x == __y); } @@ -498,48 +508,53 @@ protected: _LIBCPP_COMPRESSED_PAIR(__begin_node, __before_begin_, __node_allocator, __alloc_); - _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() _NOEXCEPT { return pointer_traits<__begin_node_pointer>::pointer_to(__before_begin_); } - _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() const _NOEXCEPT { - return pointer_traits<__begin_node_pointer>::pointer_to(const_cast<__begin_node&>(__before_begin_)); + + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __begin_node_pointer __before_begin() const _NOEXCEPT { + return pointer_traits<__begin_node_pointer>::pointer_to( + *const_cast<__begin_node*>(std::addressof(__before_begin_))); } typedef __forward_list_iterator<__node_pointer> iterator; typedef __forward_list_const_iterator<__node_pointer> const_iterator; - _LIBCPP_HIDE_FROM_ABI __forward_list_base() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_base() + _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) : __before_begin_(__begin_node()) {} - _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const allocator_type& __a) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const allocator_type& __a) : __before_begin_(__begin_node()), __alloc_(__node_allocator(__a)) {} - _LIBCPP_HIDE_FROM_ABI explicit 
__forward_list_base(const __node_allocator& __a) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit __forward_list_base(const __node_allocator& __a) : __before_begin_(__begin_node()), __alloc_(__a) {} public: # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __forward_list_base(__forward_list_base&& __x) noexcept(is_nothrow_move_constructible<__node_allocator>::value); - _LIBCPP_HIDE_FROM_ABI __forward_list_base(__forward_list_base&& __x, const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + __forward_list_base(__forward_list_base&& __x, const allocator_type& __a); # endif // _LIBCPP_CXX03_LANG __forward_list_base(const __forward_list_base&) = delete; __forward_list_base& operator=(const __forward_list_base&) = delete; - _LIBCPP_HIDE_FROM_ABI ~__forward_list_base(); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI ~__forward_list_base(); protected: - _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x) { __copy_assign_alloc(__x, integral_constant()); } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x) _NOEXCEPT_(!__node_traits::propagate_on_container_move_assignment::value || is_nothrow_move_assignable<__node_allocator>::value) { __move_assign_alloc(__x, integral_constant()); } template - _LIBCPP_HIDE_FROM_ABI __node_pointer __create_node(__node_pointer __next, _Args&&... __args) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __node_pointer + __create_node(__node_pointer __next, _Args&&... __args) { __allocation_guard<__node_allocator> __guard(__alloc_, 1); // Begin the lifetime of the node itself. 
Note that this doesn't begin the lifetime of the value // held inside the node, since we need to use the allocator's construct() method for that. @@ -554,7 +569,7 @@ protected: return __guard.__release_ptr(); } - _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __delete_node(__node_pointer __node) { // For the same reason as above, we use the allocator's destroy() method for the value_type, // but not for the node itself. __node_traits::destroy(__alloc_, std::addressof(__node->__get_value())); @@ -563,7 +578,7 @@ protected: } public: - _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(__forward_list_base& __x) # if _LIBCPP_STD_VER >= 14 _NOEXCEPT; # else @@ -571,18 +586,21 @@ public: # endif protected: - _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT; + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT; private: - _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base&, false_type) {} - _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base& __x, true_type) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __copy_assign_alloc(const __forward_list_base&, false_type) { + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void + __copy_assign_alloc(const __forward_list_base& __x, true_type) { if (__alloc_ != __x.__alloc_) clear(); __alloc_ = __x.__alloc_; } - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base&, false_type) _NOEXCEPT {} - _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void + __move_assign_alloc(__forward_list_base&, false_type) _NOEXCEPT {} + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign_alloc(__forward_list_base& __x, true_type) _NOEXCEPT_(is_nothrow_move_assignable<__node_allocator>::value) { __alloc_ 
= std::move(__x.__alloc_); } @@ -591,14 +609,15 @@ private: # ifndef _LIBCPP_CXX03_LANG template -inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base&& __x) noexcept( - is_nothrow_move_constructible<__node_allocator>::value) +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __forward_list_base<_Tp, _Alloc>::__forward_list_base( + __forward_list_base&& __x) noexcept(is_nothrow_move_constructible<__node_allocator>::value) : __before_begin_(std::move(__x.__before_begin_)), __alloc_(std::move(__x.__alloc_)) { __x.__before_begin()->__next_ = nullptr; } template -inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base&& __x, const allocator_type& __a) +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline __forward_list_base<_Tp, _Alloc>::__forward_list_base( + __forward_list_base&& __x, const allocator_type& __a) : __before_begin_(__begin_node()), __alloc_(__node_allocator(__a)) { if (__alloc_ == __x.__alloc_) { __before_begin()->__next_ = __x.__before_begin()->__next_; @@ -609,12 +628,12 @@ inline __forward_list_base<_Tp, _Alloc>::__forward_list_base(__forward_list_base # endif // _LIBCPP_CXX03_LANG template -__forward_list_base<_Tp, _Alloc>::~__forward_list_base() { +_LIBCPP_CONSTEXPR_SINCE_CXX26 __forward_list_base<_Tp, _Alloc>::~__forward_list_base() { clear(); } template -inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x) +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x) # if _LIBCPP_STD_VER >= 14 _NOEXCEPT # else @@ -627,7 +646,7 @@ inline void __forward_list_base<_Tp, _Alloc>::swap(__forward_list_base& __x) } template -void __forward_list_base<_Tp, _Alloc>::clear() _NOEXCEPT { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void __forward_list_base<_Tp, _Alloc>::clear() _NOEXCEPT { for (__node_pointer __p = __before_begin()->__next_; __p != nullptr;) { __node_pointer __next = __p->__next_; __delete_node(__p); @@ -672,105 +691,123 @@ public: typedef void 
__remove_return_type; # endif - _LIBCPP_HIDE_FROM_ABI forward_list() _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) { - } // = default; - _LIBCPP_HIDE_FROM_ABI explicit forward_list(const allocator_type& __a); - _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list() + _NOEXCEPT_(is_nothrow_default_constructible<__node_allocator>::value) {} // = default; + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n); # if _LIBCPP_STD_VER >= 14 - _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n, const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI explicit forward_list(size_type __n, const allocator_type& __a); # endif - _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v); template <__enable_if_t<__is_allocator<_Alloc>::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI forward_list(size_type __n, const value_type& __v, const allocator_type& __a) : __base(__a) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(size_type __n, const value_type& __v, const allocator_type& __a) + : __base(__a) { insert_after(cbefore_begin(), __n, __v); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l); template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a); # if _LIBCPP_STD_VER >= 23 template 
<_ContainerCompatibleRange<_Tp> _Range> - _LIBCPP_HIDE_FROM_ABI forward_list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type()) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(from_range_t, _Range&& __range, const allocator_type& __a = allocator_type()) : __base(__a) { prepend_range(std::forward<_Range>(__range)); } # endif - _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x); - _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x, const __type_identity_t& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(const forward_list& __x); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(const forward_list& __x, const __type_identity_t& __a); - _LIBCPP_HIDE_FROM_ABI forward_list& operator=(const forward_list& __x); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(const forward_list& __x); # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible<__base>::value) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(forward_list&& __x) noexcept(is_nothrow_move_constructible<__base>::value) : __base(std::move(__x)) {} - _LIBCPP_HIDE_FROM_ABI forward_list(forward_list&& __x, const __type_identity_t& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(forward_list&& __x, const __type_identity_t& __a); - _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list __il); - _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list __il, const allocator_type& __a); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list(initializer_list __il); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI + forward_list(initializer_list __il, const allocator_type& __a); - _LIBCPP_HIDE_FROM_ABI forward_list& operator=(forward_list&& __x) noexcept( + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(forward_list&& __x) noexcept( 
(__node_traits::propagate_on_container_move_assignment::value && is_nothrow_move_assignable::value) || allocator_traits::is_always_equal::value); - _LIBCPP_HIDE_FROM_ABI forward_list& operator=(initializer_list __il); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI forward_list& operator=(initializer_list __il); - _LIBCPP_HIDE_FROM_ABI void assign(initializer_list __il); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(initializer_list __il); # endif // _LIBCPP_CXX03_LANG // ~forward_list() = default; template ::value, int> = 0> - void _LIBCPP_HIDE_FROM_ABI assign(_InputIterator __f, _InputIterator __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 void _LIBCPP_HIDE_FROM_ABI assign(_InputIterator __f, _InputIterator __l); # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_Tp> _Range> - _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign_range(_Range&& __range) { __assign_with_sentinel(ranges::begin(__range), ranges::end(__range)); } # endif - _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void assign(size_type __n, const value_type& __v); - _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { return allocator_type(this->__alloc_); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI allocator_type get_allocator() const _NOEXCEPT { + return allocator_type(this->__alloc_); + } - _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { return iterator(__base::__before_begin()->__next_); } - _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator begin() _NOEXCEPT { + return iterator(__base::__before_begin()->__next_); + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()->__next_); } - _LIBCPP_HIDE_FROM_ABI iterator 
end() _NOEXCEPT { return iterator(nullptr); } - _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { return const_iterator(nullptr); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator end() _NOEXCEPT { return iterator(nullptr); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator end() const _NOEXCEPT { + return const_iterator(nullptr); + } - _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbegin() const _NOEXCEPT { return const_iterator(__base::__before_begin()->__next_); } - _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { return const_iterator(nullptr); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cend() const _NOEXCEPT { + return const_iterator(nullptr); + } - _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { return iterator(__base::__before_begin()); } - _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator before_begin() _NOEXCEPT { + return iterator(__base::__before_begin()); + } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator before_begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()); } - _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_iterator cbefore_begin() const _NOEXCEPT { return const_iterator(__base::__before_begin()); } - [[__nodiscard__]] _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { + [[__nodiscard__]] _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool empty() const _NOEXCEPT { return __base::__before_begin()->__next_ == nullptr; } - _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI size_type max_size() const _NOEXCEPT { return std::min(__node_traits::max_size(this->__alloc_), numeric_limits::max()); 
} - _LIBCPP_HIDE_FROM_ABI reference front() { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference front() { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list"); return __base::__before_begin()->__next_->__get_value(); } - _LIBCPP_HIDE_FROM_ABI const_reference front() const { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI const_reference front() const { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::front called on an empty list"); return __base::__before_begin()->__next_->__get_value(); } @@ -778,54 +815,59 @@ public: # ifndef _LIBCPP_CXX03_LANG # if _LIBCPP_STD_VER >= 17 template - _LIBCPP_HIDE_FROM_ABI reference emplace_front(_Args&&... __args); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI reference emplace_front(_Args&&... __args); # else template - _LIBCPP_HIDE_FROM_ABI void emplace_front(_Args&&... __args); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void emplace_front(_Args&&... __args); # endif - _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(value_type&& __v); # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void push_front(const value_type& __v); # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_Tp> _Range> - _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void prepend_range(_Range&& __range) { insert_range_after(cbefore_begin(), std::forward<_Range>(__range)); } # endif - _LIBCPP_HIDE_FROM_ABI void pop_front(); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void pop_front(); # ifndef _LIBCPP_CXX03_LANG template - _LIBCPP_HIDE_FROM_ABI iterator emplace_after(const_iterator __p, _Args&&... __args); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator emplace_after(const_iterator __p, _Args&&... 
__args); - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, value_type&& __v); - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, initializer_list __il) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, value_type&& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + insert_after(const_iterator __p, initializer_list __il) { return insert_after(__p, __il.begin(), __il.end()); } # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, const value_type& __v); - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, size_type __n, const value_type& __v) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + insert_after(const_iterator __p, size_type __n, const value_type& __v) { return __insert_after(__p, __n, __v); } template ::value, int> = 0> - _LIBCPP_HIDE_FROM_ABI iterator insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l); # if _LIBCPP_STD_VER >= 23 template <_ContainerCompatibleRange<_Tp> _Range> - _LIBCPP_HIDE_FROM_ABI iterator insert_range_after(const_iterator __position, _Range&& __range) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + insert_range_after(const_iterator __position, _Range&& __range) { return __insert_after_with_sentinel(__position, ranges::begin(__range), ranges::end(__range)); } # endif template - _LIBCPP_HIDE_FROM_ABI iterator __insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + __insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l); - _LIBCPP_HIDE_FROM_ABI iterator 
erase_after(const_iterator __p); - _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __f, const_iterator __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __p); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator erase_after(const_iterator __f, const_iterator __l); - _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void swap(forward_list& __x) # if _LIBCPP_STD_VER >= 14 _NOEXCEPT # else @@ -835,58 +877,63 @@ public: __base::swap(__x); } - _LIBCPP_HIDE_FROM_ABI void resize(size_type __n); - _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v); - _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void resize(size_type __n, const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void clear() _NOEXCEPT { __base::clear(); } - _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x); - _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x, const_iterator __i); - _LIBCPP_HIDE_FROM_ABI void + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void + splice_after(const_iterator __p, forward_list&& __x, const_iterator __i); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list&& __x, const_iterator __f, const_iterator __l); - _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x); - _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x, const_iterator __i); - _LIBCPP_HIDE_FROM_ABI void + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x); + 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void + splice_after(const_iterator __p, forward_list& __x, const_iterator __i); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void splice_after(const_iterator __p, forward_list& __x, const_iterator __f, const_iterator __l); - _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __v); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove(const value_type& __v); template - _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Predicate __pred); - _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type remove_if(_Predicate __pred); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique() { return unique(__equal_to()); } template - _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPredicate __binary_pred); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __remove_return_type unique(_BinaryPredicate __binary_pred); # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x) { merge(__x, __less<>()); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x) { merge(__x, __less<>()); } template - _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x, _Compare __comp) { + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list&& __x, _Compare __comp) { merge(__x, std::move(__comp)); } # endif // _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x) { merge(__x, __less<>()); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x) { merge(__x, __less<>()); } template - _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x, _Compare __comp); - _LIBCPP_HIDE_FROM_ABI void sort() { sort(__less<>()); } + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void merge(forward_list& __x, _Compare __comp); + 
_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort() { sort(__less<>()); } template - _LIBCPP_HIDE_FROM_ABI void sort(_Compare __comp); - _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT; + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void sort(_Compare __comp); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void reverse() _NOEXCEPT; private: # ifndef _LIBCPP_CXX03_LANG - _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, true_type) + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, true_type) _NOEXCEPT_(is_nothrow_move_assignable::value); - _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, false_type); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __move_assign(forward_list& __x, false_type); # endif // _LIBCPP_CXX03_LANG template - _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iter __f, _Sent __l); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void __assign_with_sentinel(_Iter __f, _Sent __l); template - _LIBCPP_HIDE_FROM_ABI iterator __insert_after(const_iterator __p, size_type __n, _Args&&... __args); + _LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI iterator + __insert_after(const_iterator __p, size_type __n, _Args&&... 
__args); template - static _LIBCPP_HIDE_FROM_ABI __node_pointer __merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp); + _LIBCPP_CONSTEXPR_SINCE_CXX26 static _LIBCPP_HIDE_FROM_ABI __node_pointer + __merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp); // TODO: Make this _LIBCPP_HIDE_FROM_ABI template - static _LIBCPP_HIDDEN __node_pointer __sort(__node_pointer __f, difference_type __sz, _Compare& __comp); + _LIBCPP_CONSTEXPR_SINCE_CXX26 static _LIBCPP_HIDDEN __node_pointer + __sort(__node_pointer __f, difference_type __sz, _Compare& __comp); }; # if _LIBCPP_STD_VER >= 17 @@ -911,10 +958,10 @@ forward_list(from_range_t, _Range&&, _Alloc = _Alloc()) -> forward_list -inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : __base(__a) {} +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>::forward_list(const allocator_type& __a) : __base(__a) {} template -forward_list<_Tp, _Alloc>::forward_list(size_type __n) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n) { if (__n > 0) { for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) { __p->__next_ = this->__create_node(/* next = */ nullptr); @@ -924,7 +971,8 @@ forward_list<_Tp, _Alloc>::forward_list(size_type __n) { # if _LIBCPP_STD_VER >= 14 template -forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc) : __base(__base_alloc) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __base_alloc) + : __base(__base_alloc) { if (__n > 0) { for (__begin_node_pointer __p = __base::__before_begin(); __n > 0; --__n, __p = __p->__next_as_begin()) { __p->__next_ = this->__create_node(/* next = */ nullptr); @@ -934,37 +982,39 @@ forward_list<_Tp, _Alloc>::forward_list(size_type __n, const allocator_type& __b # endif template -forward_list<_Tp, _Alloc>::forward_list(size_type __n, const value_type& 
__v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(size_type __n, const value_type& __v) { insert_after(cbefore_begin(), __n, __v); } template template ::value, int> > -forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l) { insert_after(cbefore_begin(), __f, __l); } template template ::value, int> > +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(_InputIterator __f, _InputIterator __l, const allocator_type& __a) : __base(__a) { insert_after(cbefore_begin(), __f, __l); } template -forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x) +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x) : __base(__node_traits::select_on_container_copy_construction(__x.__alloc_)) { insert_after(cbefore_begin(), __x.begin(), __x.end()); } template +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(const forward_list& __x, const __type_identity_t& __a) : __base(__a) { insert_after(cbefore_begin(), __x.begin(), __x.end()); } template -forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_list& __x) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_list& __x) { if (this != std::addressof(__x)) { __base::__copy_assign_alloc(__x); assign(__x.begin(), __x.end()); @@ -974,6 +1024,7 @@ forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(const forward_li # ifndef _LIBCPP_CXX03_LANG template +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(forward_list&& __x, const __type_identity_t& __a) : __base(std::move(__x), __a) { if (this->__alloc_ != __x.__alloc_) { @@ -983,17 +1034,19 @@ forward_list<_Tp, _Alloc>::forward_list(forward_list&& __x, const __type_identit } template -forward_list<_Tp, 
_Alloc>::forward_list(initializer_list __il) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 forward_list<_Tp, _Alloc>::forward_list(initializer_list __il) { insert_after(cbefore_begin(), __il.begin(), __il.end()); } template -forward_list<_Tp, _Alloc>::forward_list(initializer_list __il, const allocator_type& __a) : __base(__a) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 +forward_list<_Tp, _Alloc>::forward_list(initializer_list __il, const allocator_type& __a) + : __base(__a) { insert_after(cbefore_begin(), __il.begin(), __il.end()); } template -void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type) +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type) _NOEXCEPT_(is_nothrow_move_assignable::value) { clear(); __base::__move_assign_alloc(__x); @@ -1002,7 +1055,7 @@ void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, true_type) } template -void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) { if (this->__alloc_ == __x.__alloc_) __move_assign(__x, true_type()); else { @@ -1012,7 +1065,8 @@ void forward_list<_Tp, _Alloc>::__move_assign(forward_list& __x, false_type) { } template -inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) noexcept( +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>& +forward_list<_Tp, _Alloc>::operator=(forward_list&& __x) noexcept( (__node_traits::propagate_on_container_move_assignment::value && is_nothrow_move_assignable::value) || allocator_traits::is_always_equal::value) { @@ -1021,7 +1075,8 @@ inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(forward_l } template -inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(initializer_list __il) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline forward_list<_Tp, _Alloc>& +forward_list<_Tp, 
_Alloc>::operator=(initializer_list __il) { assign(__il.begin(), __il.end()); return *this; } @@ -1030,13 +1085,14 @@ inline forward_list<_Tp, _Alloc>& forward_list<_Tp, _Alloc>::operator=(initializ template template ::value, int> > -void forward_list<_Tp, _Alloc>::assign(_InputIterator __f, _InputIterator __l) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::assign(_InputIterator __f, _InputIterator __l) { __assign_with_sentinel(__f, __l); } template template -_LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::__assign_with_sentinel(_Iter __f, _Sent __l) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI void +forward_list<_Tp, _Alloc>::__assign_with_sentinel(_Iter __f, _Sent __l) { iterator __i = before_begin(); iterator __j = std::next(__i); iterator __e = end(); @@ -1049,7 +1105,7 @@ _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::__assign_with_sentinel(_It } template -void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) { iterator __i = before_begin(); iterator __j = std::next(__i); iterator __e = end(); @@ -1064,18 +1120,19 @@ void forward_list<_Tp, _Alloc>::assign(size_type __n, const value_type& __v) { # ifndef _LIBCPP_CXX03_LANG template -inline void forward_list<_Tp, _Alloc>::assign(initializer_list __il) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void forward_list<_Tp, _Alloc>::assign(initializer_list __il) { assign(__il.begin(), __il.end()); } template template +_LIBCPP_CONSTEXPR_SINCE_CXX26 # if _LIBCPP_STD_VER >= 17 -typename forward_list<_Tp, _Alloc>::reference + typename forward_list<_Tp, _Alloc>::reference # else -void + void # endif -forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) { + forward_list<_Tp, _Alloc>::emplace_front(_Args&&... 
__args) { __base::__before_begin()->__next_ = this->__create_node(/* next = */ __base::__before_begin()->__next_, std::forward<_Args>(__args)...); # if _LIBCPP_STD_VER >= 17 @@ -1084,7 +1141,7 @@ forward_list<_Tp, _Alloc>::emplace_front(_Args&&... __args) { } template -void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) { __base::__before_begin()->__next_ = this->__create_node(/* next = */ __base::__before_begin()->__next_, std::move(__v)); } @@ -1092,12 +1149,12 @@ void forward_list<_Tp, _Alloc>::push_front(value_type&& __v) { # endif // _LIBCPP_CXX03_LANG template -void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::push_front(const value_type& __v) { __base::__before_begin()->__next_ = this->__create_node(/* next = */ __base::__before_begin()->__next_, __v); } template -void forward_list<_Tp, _Alloc>::pop_front() { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::pop_front() { _LIBCPP_ASSERT_NON_NULL(!empty(), "forward_list::pop_front called on an empty list"); __node_pointer __p = __base::__before_begin()->__next_; __base::__before_begin()->__next_ = __p->__next_; @@ -1108,7 +1165,7 @@ void forward_list<_Tp, _Alloc>::pop_front() { template template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... __args) { __begin_node_pointer const __r = __p.__get_begin(); __r->__next_ = this->__create_node(/* next = */ __r->__next_, std::forward<_Args>(__args)...); @@ -1116,7 +1173,7 @@ forward_list<_Tp, _Alloc>::emplace_after(const_iterator __p, _Args&&... 
__args) } template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) { __begin_node_pointer const __r = __p.__get_begin(); __r->__next_ = this->__create_node(/* next = */ __r->__next_, std::move(__v)); @@ -1126,7 +1183,7 @@ forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, value_type&& __v) { # endif // _LIBCPP_CXX03_LANG template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, const value_type& __v) { __begin_node_pointer const __r = __p.__get_begin(); __r->__next_ = this->__create_node(/* next = */ __r->__next_, __v); @@ -1135,7 +1192,7 @@ forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, const value_type& __ template template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Args&&... 
__args) { __begin_node_pointer __r = __p.__get_begin(); if (__n > 0) { @@ -1159,21 +1216,21 @@ forward_list<_Tp, _Alloc>::__insert_after(const_iterator __p, size_type __n, _Ar # endif // _LIBCPP_HAS_EXCEPTIONS __last->__next_ = __r->__next_; __r->__next_ = __first; - __r = static_cast<__begin_node_pointer>(__last); + __r = __forward_node_traits<__node_pointer>::__as_iter_node(__last); } return iterator(__r); } template template ::value, int> > -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::insert_after(const_iterator __p, _InputIterator __f, _InputIterator __l) { return __insert_after_with_sentinel(__p, std::move(__f), std::move(__l)); } template template -_LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _InputIterator __f, _Sentinel __l) { __begin_node_pointer __r = __p.__get_begin(); @@ -1200,14 +1257,15 @@ forward_list<_Tp, _Alloc>::__insert_after_with_sentinel(const_iterator __p, _Inp __last->__next_ = __r->__next_; __r->__next_ = __first; - __r = static_cast<__begin_node_pointer>(__last); + __r = __forward_node_traits<__node_pointer>::__as_iter_node(__last); } return iterator(__r); } template -typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator +forward_list<_Tp, _Alloc>::erase_after(const_iterator __f) { __begin_node_pointer __p = __f.__get_begin(); __node_pointer __n = __p->__next_; __p->__next_ = __n->__next_; @@ -1216,7 +1274,7 @@ typename forward_list<_Tp, _Alloc>::iterator forward_list<_Tp, _Alloc>::erase_af } template -typename forward_list<_Tp, _Alloc>::iterator +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::iterator 
forward_list<_Tp, _Alloc>::erase_after(const_iterator __f, const_iterator __l) { __node_pointer __e = __l.__get_unsafe_node_pointer(); if (__f != __l) { @@ -1236,7 +1294,7 @@ forward_list<_Tp, _Alloc>::erase_after(const_iterator __f, const_iterator __l) { } template -void forward_list<_Tp, _Alloc>::resize(size_type __n) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::resize(size_type __n) { size_type __sz = 0; iterator __p = before_begin(); iterator __i = begin(); @@ -1250,7 +1308,7 @@ void forward_list<_Tp, _Alloc>::resize(size_type __n) { } template -void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) { size_type __sz = 0; iterator __p = before_begin(); iterator __i = begin(); @@ -1264,7 +1322,7 @@ void forward_list<_Tp, _Alloc>::resize(size_type __n, const value_type& __v) { } template -void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& __x) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& __x) { if (!__x.empty()) { if (__p.__get_begin()->__next_ != nullptr) { const_iterator __lm1 = __x.before_begin(); @@ -1278,7 +1336,8 @@ void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& _ } template -void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /*__other*/, const_iterator __i) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void +forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& /*__other*/, const_iterator __i) { const_iterator __lm1 = std::next(__i); if (__p != __i && __p != __lm1) { __i.__get_begin()->__next_ = __lm1.__get_begin()->__next_; @@ -1288,7 +1347,7 @@ void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list& / } template -void forward_list<_Tp, _Alloc>::splice_after( +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, 
_Alloc>::splice_after( const_iterator __p, forward_list& /*__other*/, const_iterator __f, const_iterator __l) { if (__f != __l && __p != __f) { const_iterator __lm1 = __f; @@ -1303,24 +1362,26 @@ void forward_list<_Tp, _Alloc>::splice_after( } template -inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void +forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x) { splice_after(__p, __x); } template -inline _LIBCPP_HIDE_FROM_ABI void +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after(const_iterator __p, forward_list&& __x, const_iterator __i) { splice_after(__p, __x, __i); } template -inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after( +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void forward_list<_Tp, _Alloc>::splice_after( const_iterator __p, forward_list&& __x, const_iterator __f, const_iterator __l) { splice_after(__p, __x, __f, __l); } template -typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::remove(const value_type& __v) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type +forward_list<_Tp, _Alloc>::remove(const value_type& __v) { forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0; const iterator __e = end(); @@ -1343,7 +1404,8 @@ typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Allo template template -typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type +forward_list<_Tp, _Alloc>::remove_if(_Predicate __pred) { forward_list<_Tp, _Alloc> 
__deleted_nodes(get_allocator()); // collect the nodes we're removing typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0; const iterator __e = end(); @@ -1366,7 +1428,7 @@ typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Allo template template -typename forward_list<_Tp, _Alloc>::__remove_return_type +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__remove_return_type forward_list<_Tp, _Alloc>::unique(_BinaryPredicate __binary_pred) { forward_list<_Tp, _Alloc> __deleted_nodes(get_allocator()); // collect the nodes we're removing typename forward_list<_Tp, _Alloc>::size_type __count_removed = 0; @@ -1384,7 +1446,7 @@ forward_list<_Tp, _Alloc>::unique(_BinaryPredicate __binary_pred) { template template -void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) { if (this != std::addressof(__x)) { __base::__before_begin()->__next_ = __merge(__base::__before_begin()->__next_, __x.__before_begin()->__next_, __comp); @@ -1394,7 +1456,7 @@ void forward_list<_Tp, _Alloc>::merge(forward_list& __x, _Compare __comp) { template template -typename forward_list<_Tp, _Alloc>::__node_pointer +_LIBCPP_CONSTEXPR_SINCE_CXX26 typename forward_list<_Tp, _Alloc>::__node_pointer forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2, _Compare& __comp) { if (__f1 == nullptr) return __f2; @@ -1431,13 +1493,13 @@ forward_list<_Tp, _Alloc>::__merge(__node_pointer __f1, __node_pointer __f2, _Co template template -inline void forward_list<_Tp, _Alloc>::sort(_Compare __comp) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline void forward_list<_Tp, _Alloc>::sort(_Compare __comp) { __base::__before_begin()->__next_ = __sort(__base::__before_begin()->__next_, std::distance(begin(), end()), __comp); } template template -typename forward_list<_Tp, _Alloc>::__node_pointer +_LIBCPP_CONSTEXPR_SINCE_CXX26 
typename forward_list<_Tp, _Alloc>::__node_pointer forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Compare& __comp) { switch (__sz) { case 0: @@ -1461,7 +1523,7 @@ forward_list<_Tp, _Alloc>::__sort(__node_pointer __f1, difference_type __sz, _Co } template -void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT { +_LIBCPP_CONSTEXPR_SINCE_CXX26 void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT { __node_pointer __p = __base::__before_begin()->__next_; if (__p != nullptr) { __node_pointer __f = __p->__next_; @@ -1477,7 +1539,8 @@ void forward_list<_Tp, _Alloc>::reverse() _NOEXCEPT { } template -_LIBCPP_HIDE_FROM_ABI bool operator==(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI bool +operator==(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { typedef forward_list<_Tp, _Alloc> _Cp; typedef typename _Cp::const_iterator _Ip; _Ip __ix = __x.begin(); @@ -1493,31 +1556,31 @@ _LIBCPP_HIDE_FROM_ABI bool operator==(const forward_list<_Tp, _Alloc>& __x, cons # if _LIBCPP_STD_VER <= 17 template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator!=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return !(__x == __y); } template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator<(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return std::lexicographical_compare(__x.begin(), __x.end(), __y.begin(), __y.end()); } template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator>(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return __y < __x; } template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator>=(const forward_list<_Tp, 
_Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return !(__x < __y); } template -inline _LIBCPP_HIDE_FROM_ABI bool +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI bool operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc>& __y) { return !(__y < __x); } @@ -1525,7 +1588,7 @@ operator<=(const forward_list<_Tp, _Alloc>& __x, const forward_list<_Tp, _Alloc> # else // #if _LIBCPP_STD_VER <= 17 template -_LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp> +_LIBCPP_CONSTEXPR_SINCE_CXX26 _LIBCPP_HIDE_FROM_ABI __synth_three_way_result<_Tp> operator<=>(const forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _Allocator>& __y) { return std::lexicographical_compare_three_way(__x.begin(), __x.end(), __y.begin(), __y.end(), std::__synth_three_way); } @@ -1533,20 +1596,20 @@ operator<=>(const forward_list<_Tp, _Allocator>& __x, const forward_list<_Tp, _A # endif // #if _LIBCPP_STD_VER <= 17 template -inline _LIBCPP_HIDE_FROM_ABI void swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y) - _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI void +swap(forward_list<_Tp, _Alloc>& __x, forward_list<_Tp, _Alloc>& __y) _NOEXCEPT_(_NOEXCEPT_(__x.swap(__y))) { __x.swap(__y); } # if _LIBCPP_STD_VER >= 20 template -inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type erase_if(forward_list<_Tp, _Allocator>& __c, _Predicate __pred) { return __c.remove_if(__pred); } template -inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type +_LIBCPP_CONSTEXPR_SINCE_CXX26 inline _LIBCPP_HIDE_FROM_ABI typename forward_list<_Tp, _Allocator>::size_type erase(forward_list<_Tp, _Allocator>& __c, const _Up& __v) { return std::erase_if(__c, [&](const auto& __elem) -> bool { return __elem == __v; }); } diff --git a/libcxx/include/version 
b/libcxx/include/version index 65fae111dc8ed..87c4ede9a7e59 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -68,6 +68,7 @@ __cpp_lib_constexpr_charconv 202207L __cpp_lib_constexpr_cmath 202202L __cpp_lib_constexpr_complex 201711L __cpp_lib_constexpr_dynamic_alloc 201907L +__cpp_lib_constexpr_forward_list 202502L __cpp_lib_constexpr_functional 201907L __cpp_lib_constexpr_iterator 201811L __cpp_lib_constexpr_memory 202202L @@ -543,6 +544,7 @@ __cpp_lib_void_t 201411L # define __cpp_lib_bitset 202306L # undef __cpp_lib_constexpr_algorithms # define __cpp_lib_constexpr_algorithms 202306L +# define __cpp_lib_constexpr_forward_list 202502L # if !defined(_LIBCPP_ABI_VCRUNTIME) # define __cpp_lib_constexpr_new 202406L # endif diff --git a/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp index 52adfc4d85985..a9ef855e9a73e 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/compare.three_way.pass.cpp @@ -11,7 +11,7 @@ // template // synth-three-way-result operator<=>(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 #include #include @@ -20,6 +20,9 @@ int main(int, char**) { assert(test_sequence_container_spaceship()); - // `std::forward_list` is not constexpr, so no `static_assert` test here. 
+#if TEST_STD_VER >= 26 + static_assert(test_sequence_container_spaceship()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp index dbc0631d11930..4482d26f308a6 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/empty.pass.cpp @@ -10,7 +10,7 @@ // class forward_list -// bool empty() const noexcept; +// bool empty() const noexcept; // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef std::forward_list C; C c; @@ -42,5 +42,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp index 757db7d957f5f..50b549f17d561 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.access/front.pass.cpp @@ -8,17 +8,18 @@ // -// reference front(); -// const_reference front() const; +// reference front(); // constexpr since C++26 +// const_reference front() const; // constexpr since C++26 #include #include #include +#include "test_allocator.h" #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -58,5 +59,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp index 31893a1b95994..4645560048cf6 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.compile.fail.cpp @@ -8,7 +8,7 @@ // -// explicit forward_list(const allocator_type& a); +// explicit forward_list(const allocator_type& a); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_allocator.h" #include "../../../NotConstructible.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef test_allocator A; typedef A::value_type T; @@ -26,5 +26,14 @@ int main(int, char**) { assert(c.empty()); } + return true; +} + +int main(int, char**) { + test(); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp index bfb330fdaf9fc..ffc6d37f28160 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/alloc.pass.cpp @@ -8,7 +8,7 @@ // -// explicit forward_list(const allocator_type& a); +// explicit forward_list(const allocator_type& a); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "../../../NotConstructible.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef test_allocator A; typedef A::value_type T; @@ -46,5 +46,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp index 27d450c63dcae..b99af4ccb79ec 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_copy.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list& operator=(const forward_list& x); +// forward_list& operator=(const forward_list& x); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -143,5 +143,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp index 1cdcca82d3352..ea2802b323a91 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_init.pass.cpp @@ -10,7 +10,7 @@ // -// void assign(initializer_list il); +// void assign(initializer_list il); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -65,5 +65,14 @@ int main(int, char**) { assert(n == 4); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp index 998a7e11ef343..9c88db6166ba7 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_move.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list& operator=(forward_list&& x); +// forward_list& operator=(forward_list&& x); // constexpr since C++26 #include #include @@ -21,7 +21,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; typedef test_allocator A; @@ -194,5 +194,14 @@ int main(int, char**) { assert(c0.empty()); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp index a22d6c4985bc5..d21898dc4663a 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_op_init.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list& operator=(initializer_list il); +// forward_list& operator=(initializer_list il); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -65,5 +65,14 @@ int main(int, char**) { assert(n == 4); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp index 9a35328740790..1601b4b47acd1 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_range.pass.cpp @@ -9,7 +9,7 @@ // // template -// void assign(InputIterator first, InputIterator last); +// void assign(InputIterator first, InputIterator last); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_iterators.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -75,5 +75,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp index b0fbfa3249e5e..75626b47c5273 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/assign_size_value.pass.cpp @@ -8,7 +8,7 @@ // -// void assign(size_type n, const value_type& v); +// void assign(size_type n, const value_type& v); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -65,5 +65,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp index 22d5054b9ae18..12d701bff4b68 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(const forward_list& x); +// forward_list(const forward_list& x); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -64,5 +64,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp index a61233e4b5d22..fc3ff485b0667 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/copy_alloc.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(const forward_list& x, const allocator_type& a); +// forward_list(const forward_list& x, const allocator_type& a); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -64,5 +64,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp index b493a89b78003..e0ea8bf66cb3b 100644 --- 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/default.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(); +// forward_list(); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -38,5 +38,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp index 312f6dbad3550..d1e1734e86f9f 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/from_range.pass.cpp @@ -9,14 +9,14 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // template R> -// forward_list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23 +// forward_list(from_range_t, R&& rg, const Allocator& = Allocator()); // C++23; constexpr since C++26 #include #include "../../from_range_sequence_containers.h" #include "test_macros.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { for_all_iterators_and_allocators([]() { test_sequence_container([](const auto&) { // No additional validation to do. 
@@ -26,8 +26,19 @@ int main(int, char**) { static_assert(test_constraints()); - test_exception_safety_throwing_copy(); - test_exception_safety_throwing_allocator(); + if (!TEST_IS_CONSTANT_EVALUATED) { + test_exception_safety_throwing_copy(); + test_exception_safety_throwing_allocator(); + } + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp index b42242b0a83d4..b7acf60aa70cc 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list(initializer_list il); +// forward_list(initializer_list il); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -38,5 +38,14 @@ int main(int, char**) { assert(n == 10); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp index 0b29cbfa9254d..33d569c921a94 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/init_alloc.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list(initializer_list il, const allocator_type& a); +// forward_list(initializer_list il, const allocator_type& a); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_allocator.h" #include 
"min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -43,5 +43,14 @@ int main(int, char**) { assert(c.get_allocator() == A()); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp index 762e252ca76fe..20575479f7357 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list(forward_list&& x); +// forward_list(forward_list&& x); // constexpr since C++26 #include #include @@ -21,7 +21,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; typedef test_allocator A; @@ -68,5 +68,14 @@ int main(int, char**) { assert(c.get_allocator() == A()); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp index a9bc2cb12f288..219505bf4fd17 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/move_alloc.pass.cpp @@ -10,7 +10,7 @@ // -// forward_list(forward_list&& x, const allocator_type& a); +// forward_list(forward_list&& x, const allocator_type& a); // constexpr since C++26 #include #include @@ -21,7 +21,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; 
typedef test_allocator A; @@ -68,5 +68,14 @@ int main(int, char**) { assert(c.get_allocator() == A()); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp index ebd0e6a5bd1e0..61393eb28938e 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range.pass.cpp @@ -9,7 +9,7 @@ // // template -// forward_list(InputIterator first, InputIterator last); +// forward_list(InputIterator first, InputIterator last); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_iterators.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -45,5 +45,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp index 4a28041ad2cbc..c0637420e328a 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/range_alloc.pass.cpp @@ -10,7 +10,7 @@ // template // forward_list(InputIterator first, InputIterator last, -// const allocator_type& a); +// const allocator_type& a); // constexpr since C++26 #include #include @@ -21,7 +21,7 @@ #include "test_iterators.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -51,5 +51,14 @@ int 
main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp index 81b128d2149e3..206854560c19f 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size.pass.cpp @@ -8,8 +8,8 @@ // -// explicit forward_list(size_type n); -// explicit forward_list(size_type n, const Alloc& a); +// explicit forward_list(size_type n); // constexpr since C++26 +// explicit forward_list(size_type n, const Alloc& a); // constexpr since C++26 #include #include diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp index 663422d1c3c30..85d11e3f40a2f 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(size_type n, const value_type& v); +// forward_list(size_type n, const value_type& v); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -42,5 +42,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp index 
af7f7471d4c98..abcdf62452b89 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.cons/size_value_alloc.pass.cpp @@ -8,7 +8,7 @@ // -// forward_list(size_type n, const value_type& v, const allocator_type& a); +// forward_list(size_type n, const value_type& v, const allocator_type& a); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef test_allocator A; typedef A::value_type T; @@ -47,5 +47,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp index 1044d779220ee..86d7769fe16ee 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase.pass.cpp @@ -11,7 +11,7 @@ // template // typename forward_list::size_type -// erase(forward_list& c, const U& value); +// erase(forward_list& c, const U& value); // constexpr since C++26 #include #include @@ -21,14 +21,14 @@ #include "min_allocator.h" template -void test0(S s, U val, S expected, std::size_t expected_erased_count) { +TEST_CONSTEXPR_CXX26 void test0(S s, U val, S expected, std::size_t expected_erased_count) { ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase(s, val))); assert(expected_erased_count == std::erase(s, val)); assert(s == expected); } template -void test() { +TEST_CONSTEXPR_CXX26 void test() { test0(S(), 1, S(), 0); test0(S({1}), 1, S(), 1); @@ -62,13 +62,21 @@ void test() { test0(S({1, 2, 1}), opt(3), S({1, 2, 1}), 0); } -int 
main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { test>(); test>>(); test>>(); - test>(); test>(); + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp index c4f45a1069a2b..c665f9cccbf0a 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.erasure/erase_if.pass.cpp @@ -11,7 +11,7 @@ // template // typename forward_list::size_type -// erase_if(forward_list& c, Predicate pred); +// erase_if(forward_list& c, Predicate pred); // constexpr since C++26 #include @@ -20,14 +20,14 @@ #include "min_allocator.h" template -void test0(S s, Pred p, S expected, std::size_t expected_erased_count) { +TEST_CONSTEXPR_CXX26 void test0(S s, Pred p, S expected, std::size_t expected_erased_count) { ASSERT_SAME_TYPE(typename S::size_type, decltype(std::erase_if(s, p))); assert(expected_erased_count == std::erase_if(s, p)); assert(s == expected); } template -void test() { +TEST_CONSTEXPR_CXX26 void test() { auto is1 = [](auto v) { return v == 1; }; auto is2 = [](auto v) { return v == 2; }; auto is3 = [](auto v) { return v == 3; }; @@ -64,13 +64,21 @@ void test() { test0(S({1, 2, 3}), False, S({1, 2, 3}), 0); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { test>(); test>>(); test>>(); - test>(); test>(); + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp index d66d2cd879515..52b5d87860aab 100644 --- 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/before_begin.pass.cpp @@ -8,9 +8,9 @@ // -// iterator before_begin(); -// const_iterator before_begin() const; -// const_iterator cbefore_begin() const; +// iterator before_begin(); // constexpr since C++26 +// const_iterator before_begin() const; // constexpr since C++26 +// const_iterator cbefore_begin() const; // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -101,5 +101,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp index 135689b2321c3..560c47b17958f 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.iter/iterators.pass.cpp @@ -8,12 +8,12 @@ // -// iterator begin(); -// iterator end(); -// const_iterator begin() const; -// const_iterator end() const; -// const_iterator cbegin() const; -// const_iterator cend() const; +// iterator begin(); // constexpr since C++26 +// iterator end(); // constexpr since C++26 +// const_iterator begin() const; // constexpr since C++26 +// const_iterator end() const; // constexpr since C++26 +// const_iterator cbegin() const; // constexpr since C++26 +// const_iterator cend() const; // constexpr since C++26 #include #include @@ -22,7 +22,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef 
std::forward_list C; @@ -69,6 +69,8 @@ int main(int, char**) { typedef std::forward_list C; C::iterator i; C::const_iterator j; + (void)i; + (void)j; } #if TEST_STD_VER >= 11 { @@ -117,6 +119,8 @@ int main(int, char**) { typedef std::forward_list> C; C::iterator i; C::const_iterator j; + (void)i; + (void)j; } #endif #if TEST_STD_VER > 11 @@ -142,5 +146,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp index a27cc757025b5..9a3adec1d9756 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/assign_range.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // template R> -// constexpr void prepend_range(R&& rg); // C++23 +// constexpr void prepend_range(R&& rg); // C++23; constexpr since C++26 #include @@ -21,7 +21,7 @@ // {empty/one-element/full} container); // - prepending move-only elements; // - an exception is thrown when copying the elements or when allocating new elements. 
-int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { static_assert(test_constraints_assign_range()); for_all_iterators_and_allocators([]() { @@ -31,8 +31,19 @@ int main(int, char**) { }); test_sequence_prepend_range_move_only(); - test_prepend_range_exception_safety_throwing_copy(); - test_prepend_range_exception_safety_throwing_allocator(); + if (!TEST_IS_CONSTANT_EVALUATED) { + test_prepend_range_exception_safety_throwing_copy(); + test_prepend_range_exception_safety_throwing_allocator(); + } + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp index 9f6d34b701df7..2e1768cf8bad9 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/clear.pass.cpp @@ -8,7 +8,7 @@ // -// void clear() noexcept; +// void clear() noexcept; // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "../../../NotConstructible.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef NotConstructible T; typedef std::forward_list C; @@ -64,5 +64,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp index f77d47ee7c74f..6433607af9b39 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp +++ 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_after.pass.cpp @@ -11,7 +11,7 @@ // // template -// iterator emplace_after(const_iterator p, Args&&... args); +// iterator emplace_after(const_iterator p, Args&&... args); // constexpr since C++26 #include #include @@ -20,7 +20,7 @@ #include "../../../Emplaceable.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef Emplaceable T; typedef std::forward_list C; @@ -84,5 +84,14 @@ int main(int, char**) { assert(std::distance(c.begin(), c.end()) == 4); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp index cd3bb20c52ae5..46ae27b43622e 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/emplace_front.pass.cpp @@ -10,7 +10,7 @@ // -// template reference emplace_front(Args&&... args); +// template reference emplace_front(Args&&... 
args); // constexpr since C++26 // return type is 'reference' in C++17; 'void' before #include @@ -21,7 +21,7 @@ #include "../../../Emplaceable.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef Emplaceable T; typedef std::forward_list C; @@ -67,5 +67,14 @@ int main(int, char**) { assert(std::distance(c.begin(), c.end()) == 2); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp index e85951798526d..73cb03c2cb7d2 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_many.pass.cpp @@ -8,7 +8,7 @@ // -// iterator erase_after(const_iterator first, const_iterator last); +// iterator erase_after(const_iterator first, const_iterator last); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -153,5 +153,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp index 892228e76def7..12997f1dad3b9 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/erase_after_one.pass.cpp @@ -8,7 
+8,7 @@ // -// iterator erase_after(const_iterator p); +// iterator erase_after(const_iterator p); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -95,5 +95,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp index 8443158413e7f..d93789dd6bb5c 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_const.pass.cpp @@ -8,7 +8,7 @@ // -// iterator insert_after(const_iterator p, const value_type& v); +// iterator insert_after(const_iterator p, const value_type& v); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -84,5 +84,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp index de924a10c18f0..54be47f4264ff 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_init.pass.cpp @@ -10,7 
+10,7 @@ // -// iterator insert_after(const_iterator p, initializer_list il); +// iterator insert_after(const_iterator p, initializer_list il); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -70,5 +70,14 @@ int main(int, char**) { assert(*std::next(c.begin(), 4) == 2); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp index af810d0f6961c..f89fbd7619da2 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_range.pass.cpp @@ -10,7 +10,7 @@ // template // iterator insert_after(const_iterator p, -// InputIterator first, InputIterator last); +// InputIterator first, InputIterator last); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_iterators.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -77,5 +77,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp index acd4bc73f724e..01b76f5cd64f1 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp +++ 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_rv.pass.cpp @@ -10,7 +10,7 @@ // -// iterator insert_after(const_iterator p, value_type&& v); +// iterator insert_after(const_iterator p, value_type&& v); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; typedef std::forward_list C; @@ -85,5 +85,14 @@ int main(int, char**) { assert(std::distance(c.begin(), c.end()) == 4); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp index 2506f04311e0e..f4f0521ad2371 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_after_size_value.pass.cpp @@ -8,7 +8,7 @@ // -// iterator insert_after(const_iterator p, size_type n, const value_type& v); +// iterator insert_after(const_iterator p, size_type n, const value_type& v); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -70,5 +70,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp index 
25f4c43f38486..71a291430b435 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/insert_range_after.pass.cpp @@ -8,8 +8,10 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=20000000 + // template R> -// constexpr iterator insert_range_after(const_iterator position, R&& rg); // C++23 +// constexpr iterator insert_range_after(const_iterator position, R&& rg); // C++23; constexpr since C++26 #include @@ -321,7 +323,7 @@ constexpr void test_sequence_insert_range_after() { } } -void test_sequence_insert_range_after_move_only() { +TEST_CONSTEXPR_CXX26 void test_sequence_insert_range_after_move_only() { MoveOnly input[5]; std::ranges::subrange in(std::move_iterator{input}, std::move_iterator{input + 5}); @@ -366,7 +368,7 @@ void test_insert_range_after_exception_safety_throwing_allocator() { #endif } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { static_assert(test_constraints_insert_range_after()); for_all_iterators_and_allocators([]() { @@ -374,8 +376,19 @@ int main(int, char**) { }); test_sequence_insert_range_after_move_only(); - test_insert_range_after_exception_safety_throwing_copy(); - test_insert_range_after_exception_safety_throwing_allocator(); + if (!TEST_IS_CONSTANT_EVALUATED) { + test_insert_range_after_exception_safety_throwing_copy(); + test_insert_range_after_exception_safety_throwing_allocator(); + } + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp index 98c7a26341179..9fcade7ff6bba 100644 --- 
a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/pop_front.pass.cpp @@ -8,7 +8,7 @@ // -// void pop_front(); +// void pop_front(); // constexpr since C++26 #include #include @@ -17,7 +17,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -71,5 +71,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp index 418aa72052ba9..c4b9cd9bdfc41 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/prepend_range.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // template R> -// constexpr void prepend_range(R&& rg); // C++23 +// constexpr void prepend_range(R&& rg); // C++23; constexpr since C++26 #include @@ -21,7 +21,7 @@ // {empty/one-element/full} container); // - prepending move-only elements; // - an exception is thrown when copying the elements or when allocating new elements. 
-int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { static_assert(test_constraints_prepend_range()); for_all_iterators_and_allocators([]() { @@ -31,8 +31,19 @@ int main(int, char**) { }); test_sequence_prepend_range_move_only(); - test_prepend_range_exception_safety_throwing_copy(); - test_prepend_range_exception_safety_throwing_allocator(); + if (!TEST_IS_CONSTANT_EVALUATED) { + test_prepend_range_exception_safety_throwing_copy(); + test_prepend_range_exception_safety_throwing_allocator(); + } + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp index f99c40fa0c1a0..61c5dcac0545e 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_const.pass.cpp @@ -8,7 +8,7 @@ // -// void push_front(const value_type& v); +// void push_front(const value_type& v); // constexpr since C++26 #include #include @@ -16,7 +16,7 @@ #include "test_macros.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -44,5 +44,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp index 467037465eedd..cd24d6ff6af06 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp +++ 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_exception_safety.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: no-exceptions // -// void push_front(const value_type& x); +// void push_front(const value_type& x); // constexpr since C++26 #include #include diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp index d3156c5fdd38a..b30ff7a0189e2 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/push_front_rv.pass.cpp @@ -10,7 +10,7 @@ // -// void push_front(value_type&& v); +// void push_front(value_type&& v); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "MoveOnly.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef MoveOnly T; typedef std::forward_list C; @@ -45,5 +45,14 @@ int main(int, char**) { assert(std::distance(c.begin(), c.end()) == 2); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp index 2dacf458d7d9d..f80886113bf25 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size.pass.cpp @@ -8,7 +8,7 @@ // -// void resize(size_type n); +// void resize(size_type n); // constexpr since C++26 #include #include @@ -18,8 +18,8 @@ #include "DefaultOnly.h" #include "min_allocator.h" -int main(int, char**) { - { +TEST_CONSTEXPR_CXX26 bool test() { + if 
(!TEST_IS_CONSTANT_EVALUATED) { typedef DefaultOnly T; typedef std::forward_list C; C c; @@ -65,7 +65,7 @@ int main(int, char**) { assert(*std::next(c.begin(), 5) == 0); } #if TEST_STD_VER >= 11 - { + if (!TEST_IS_CONSTANT_EVALUATED) { typedef DefaultOnly T; typedef std::forward_list> C; C c; @@ -112,5 +112,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp index a6af763e6937f..4ec859b36336d 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.modifiers/resize_size_value.pass.cpp @@ -8,7 +8,7 @@ // -// void resize(size_type n, const value_type& v); +// void resize(size_type n, const value_type& v); // constexpr since C++26 #include #include @@ -22,7 +22,7 @@ # include "container_test_types.h" #endif -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -84,7 +84,7 @@ int main(int, char**) { assert(*std::next(c.begin(), 4) == 10); assert(*std::next(c.begin(), 5) == 10); } - { + if (!TEST_IS_CONSTANT_EVALUATED) { // Test that the allocator's construct method is being used to // construct the new elements and that it's called exactly N times. 
typedef std::forward_list> Container; @@ -99,5 +99,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp index 9a162789569d3..d8e80c56bf392 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue.pass.cpp @@ -8,7 +8,7 @@ // -// void merge(forward_list& x); +// void merge(forward_list& x); // constexpr since C++26 #include #include @@ -30,11 +30,11 @@ struct value { int a; int b; - friend bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; } - friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } + friend TEST_CONSTEXPR bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; } + friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // Basic merge operation. 
typedef int T; typedef std::forward_list C; @@ -116,5 +116,14 @@ int main(int, char**) { assert(c == std::forward_list(std::begin(a), std::end(a))); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp index 4e1814044808c..0adadb2dd092f 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_lvalue_pred.pass.cpp @@ -8,7 +8,7 @@ // -// template void merge(forward_list& x, Compare comp); +// template void merge(forward_list& x, Compare comp); // constexpr since C++26 #include #include @@ -30,11 +30,11 @@ struct value { int a; int b; - friend bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; } - friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } + friend TEST_CONSTEXPR bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; } + friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // Basic merge operation. 
typedef int T; typedef std::forward_list C; @@ -117,5 +117,14 @@ int main(int, char**) { assert(c == std::forward_list(std::begin(a), std::end(a))); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp index acfa014fe2546..906748ec2702b 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue.pass.cpp @@ -10,7 +10,7 @@ // -// void merge(forward_list&& x); +// void merge(forward_list&& x); // constexpr since C++26 #include #include @@ -29,11 +29,11 @@ struct value { int a; int b; - friend bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; } - friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } + friend TEST_CONSTEXPR bool operator<(const value& lhs, const value& rhs) { return lhs.a < rhs.a; } + friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // Basic merge operation. 
typedef int T; typedef std::forward_list C; @@ -109,5 +109,14 @@ int main(int, char**) { assert(c == std::forward_list(std::begin(a), std::end(a))); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp index 41b56ce7a2884..2ced0b1596e4d 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/merge_rvalue_pred.pass.cpp @@ -10,7 +10,7 @@ // -// template void merge(forward_list&& x, Compare comp); +// template void merge(forward_list&& x, Compare comp); // constexpr since C++26 #include #include @@ -29,11 +29,11 @@ struct value { int a; int b; - friend bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; } - friend bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } + friend TEST_CONSTEXPR bool operator>(const value& lhs, const value& rhs) { return lhs.a > rhs.a; } + friend TEST_CONSTEXPR bool operator==(const value& lhs, const value& rhs) { return lhs.a == rhs.a && lhs.b == rhs.b; } }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // Basic merge operation. 
typedef int T; typedef std::forward_list C; @@ -110,5 +110,14 @@ int main(int, char**) { assert(c == std::forward_list(std::begin(a), std::end(a))); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp index ec3bf845dcc5a..b17708ba60ee6 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove.pass.cpp @@ -9,7 +9,7 @@ // // void remove(const value_type& v); // C++17 and before -// size_type remove(const value_type& v); // C++20 and after +// size_type remove(const value_type& v); // C++20 and after; // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "min_allocator.h" template -void do_remove(L& l, const typename L::value_type& value, typename L::size_type expected) { +TEST_CONSTEXPR_CXX26 void do_remove(L& l, const typename L::value_type& value, typename L::size_type expected) { typename L::size_type old_size = std::distance(l.begin(), l.end()); #if TEST_STD_VER > 17 ASSERT_SAME_TYPE(decltype(l.remove(value)), typename L::size_type); @@ -32,22 +32,22 @@ void do_remove(L& l, const typename L::value_type& value, typename L::size_type } struct S { - S(int i) : i_(new int(i)) {} - S(const S& rhs) : i_(new int(*rhs.i_)) {} - S& operator=(const S& rhs) { + TEST_CONSTEXPR_CXX20 S(int i) : i_(new int(i)) {} + TEST_CONSTEXPR_CXX20 S(const S& rhs) : i_(new int(*rhs.i_)) {} + TEST_CONSTEXPR_CXX20 S& operator=(const S& rhs) { *i_ = *rhs.i_; return *this; } - ~S() { + TEST_CONSTEXPR_CXX20 ~S() { delete i_; i_ = NULL; } - bool operator==(const S& rhs) const { return *i_ == *rhs.i_; } - int get() const { return *i_; } + TEST_CONSTEXPR bool operator==(const S& rhs) const { return *i_ == *rhs.i_; 
} + TEST_CONSTEXPR int get() const { return *i_; } int* i_; }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -171,5 +171,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp index c6325baea2590..f26205d03f645 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/remove_if.pass.cpp @@ -9,7 +9,7 @@ // // template void remove_if(Predicate pred); // C++17 and before -// template size_type remove_if(Predicate pred); // C++20 and after +// template size_type remove_if(Predicate pred); // C++20 and after; constexpr since C++26 #include #include @@ -22,7 +22,7 @@ #include "counting_predicates.h" template -void do_remove_if(L& l, Predicate pred, typename L::size_type expected) { +TEST_CONSTEXPR_CXX26 void do_remove_if(L& l, Predicate pred, typename L::size_type expected) { typename L::size_type old_size = std::distance(l.begin(), l.end()); #if TEST_STD_VER > 17 ASSERT_SAME_TYPE(decltype(l.remove_if(pred)), typename L::size_type); @@ -34,18 +34,18 @@ void do_remove_if(L& l, Predicate pred, typename L::size_type expected) { assert(old_size - std::distance(l.begin(), l.end()) == expected); } -bool g(int i) { return i < 3; } +TEST_CONSTEXPR bool g(int i) { return i < 3; } struct PredLWG526 { - PredLWG526(int i) : i_(i) {} - ~PredLWG526() { i_ = -32767; } - bool operator()(const PredLWG526& p) const { return p.i_ == i_; } + TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {} + TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; } + TEST_CONSTEXPR bool operator()(const PredLWG526& p) const { return p.i_ == i_; } 
- bool operator==(int i) const { return i == i_; } + TEST_CONSTEXPR bool operator==(int i) const { return i == i_; } int i_; }; -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef unary_counting_predicate Predicate; @@ -187,5 +187,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp index 0d0656897f34e..38f0e74f66323 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/reverse.pass.cpp @@ -8,7 +8,7 @@ // -// void reverse(); +// void reverse(); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "min_allocator.h" template -void test(int N) { +TEST_CONSTEXPR_CXX26 void test1(int N) { C c; for (int i = 0; i < N; ++i) c.push_front(i); @@ -30,12 +30,21 @@ void test(int N) { assert(*j == i); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { for (int i = 0; i < 10; ++i) - test >(i); + test1 >(i); #if TEST_STD_VER >= 11 for (int i = 0; i < 10; ++i) - test> >(i); + test1> >(i); +#endif + + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); #endif return 0; diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp index 4c91d7397adf0..f8787d70784d1 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_flist.pass.cpp @@ -8,7 +8,7 @@ // -// void splice_after(const_iterator p, 
forward_list&& x); +// void splice_after(const_iterator p, forward_list&& x); // constexpr since C++26 #include #include @@ -19,13 +19,13 @@ #include "min_allocator.h" typedef int T; -const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; -const T t2[] = {10, 11, 12, 13, 14, 15}; -const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); -const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); +TEST_CONSTEXPR const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; +TEST_CONSTEXPR const T t2[] = {10, 11, 12, 13, 14, 15}; +TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); +TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); template -void testd(const C& c, int p, int l) { +TEST_CONSTEXPR_CXX26 void testd(const C& c, int p, int l) { typename C::const_iterator i = c.begin(); int n1 = 0; for (; n1 < p; ++n1, ++i) @@ -37,7 +37,7 @@ void testd(const C& c, int p, int l) { assert(std::distance(c.begin(), c.end()) == size_t1 + l); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // splicing different containers typedef std::forward_list C; @@ -67,5 +67,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp index bb8bdea632547..7202b0e153627 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_one.pass.cpp @@ -8,7 +8,7 @@ // -// void splice_after(const_iterator p, forward_list&& x, const_iterator i); +// void splice_after(const_iterator p, forward_list&& x, const_iterator i); // constexpr since C++26 #include #include @@ -19,13 +19,13 @@ #include "min_allocator.h" typedef int T; -const 
T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; -const T t2[] = {10, 11, 12}; -const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); -const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); +TEST_CONSTEXPR const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; +TEST_CONSTEXPR const T t2[] = {10, 11, 12}; +TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); +TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); template -void testd(const C& c, int p, int f) { +TEST_CONSTEXPR_CXX26 void testd(const C& c, int p, int f) { typename C::const_iterator i = c.begin(); int n1 = 0; for (; n1 < p; ++n1, ++i) @@ -38,7 +38,7 @@ void testd(const C& c, int p, int f) { } template -void tests(const C& c, int p, int f) { +TEST_CONSTEXPR_CXX26 void tests(const C& c, int p, int f) { typename C::const_iterator i = c.begin(); int n = 0; if (p == f || p == f + 1) { @@ -67,7 +67,7 @@ void tests(const C& c, int p, int f) { assert(std::distance(c.begin(), c.end()) == size_t1); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // splicing different containers typedef std::forward_list C; @@ -117,5 +117,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp index 99b3ed1c7836b..18da6f12b28da 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/splice_after_range.pass.cpp @@ -8,8 +8,10 @@ // +// ADDITIONAL_COMPILE_FLAGS(has-fconstexpr-steps): -fconstexpr-steps=3000000 + // void splice_after(const_iterator p, forward_list&& x, -// const_iterator first, const_iterator last); +// const_iterator first, 
const_iterator last); // constexpr since C++26 #include #include @@ -20,13 +22,13 @@ #include "min_allocator.h" typedef std::ptrdiff_t T; -const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; -const T t2[] = {10, 11, 12, 13, 14, 15}; -const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); -const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); +TEST_CONSTEXPR const T t1[] = {0, 1, 2, 3, 4, 5, 6, 7}; +TEST_CONSTEXPR const T t2[] = {10, 11, 12, 13, 14, 15}; +TEST_CONSTEXPR const std::ptrdiff_t size_t1 = std::end(t1) - std::begin(t1); +TEST_CONSTEXPR const std::ptrdiff_t size_t2 = std::end(t2) - std::begin(t2); template -void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { +TEST_CONSTEXPR_CXX26 void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { typename C::const_iterator i = c.begin(); std::ptrdiff_t n1 = 0; for (; n1 < p; ++n1, ++i) @@ -39,7 +41,7 @@ void testd(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { } template -void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { +TEST_CONSTEXPR_CXX26 void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { typename C::const_iterator i = c.begin(); std::ptrdiff_t n = 0; std::ptrdiff_t d = l > f + 1 ? 
l - 1 - f : 0; @@ -69,7 +71,7 @@ void tests(const C& c, std::ptrdiff_t p, ptrdiff_t f, ptrdiff_t l) { assert(std::distance(c.begin(), c.end()) == size_t1); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { // splicing different containers typedef std::forward_list C; @@ -157,5 +159,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp index ebd1a79cdb4bc..28efff3849e68 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique.pass.cpp @@ -9,7 +9,7 @@ // // void unique(); // C++17 and before -// size_type unique(); // C++20 and after +// size_type unique(); // C++20 and after; constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "min_allocator.h" template -void do_unique(L& l, typename L::size_type expected) { +TEST_CONSTEXPR_CXX26 void do_unique(L& l, typename L::size_type expected) { typename L::size_type old_size = std::distance(l.begin(), l.end()); #if TEST_STD_VER > 17 ASSERT_SAME_TYPE(decltype(l.unique()), typename L::size_type); @@ -31,7 +31,7 @@ void do_unique(L& l, typename L::size_type expected) { assert(old_size - std::distance(l.begin(), l.end()) == expected); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -131,5 +131,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp 
b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp index 408cbf6ae9c20..f07142dffe9d9 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.ops/unique_pred.pass.cpp @@ -9,7 +9,7 @@ // // template void unique(BinaryPredicate binary_pred); // C++17 and before -// template size_type unique(BinaryPredicate binary_pred); // C++20 and after +// template size_type unique(BinaryPredicate binary_pred); // C++20 and after; constexpr since C++26 #include #include @@ -20,7 +20,7 @@ #include "min_allocator.h" template -void do_unique(L& l, Predicate pred, typename L::size_type expected) { +TEST_CONSTEXPR_CXX26 void do_unique(L& l, Predicate pred, typename L::size_type expected) { typename L::size_type old_size = std::distance(l.begin(), l.end()); #if TEST_STD_VER > 17 ASSERT_SAME_TYPE(decltype(l.unique(pred)), typename L::size_type); @@ -33,17 +33,17 @@ void do_unique(L& l, Predicate pred, typename L::size_type expected) { } struct PredLWG526 { - PredLWG526(int i) : i_(i) {} - ~PredLWG526() { i_ = -32767; } - bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; } + TEST_CONSTEXPR_CXX20 PredLWG526(int i) : i_(i) {} + TEST_CONSTEXPR_CXX20 ~PredLWG526() { i_ = -32767; } + TEST_CONSTEXPR bool operator()(const PredLWG526& lhs, const PredLWG526& rhs) const { return lhs.i_ == rhs.i_; } - bool operator==(int i) const { return i == i_; } + TEST_CONSTEXPR bool operator==(int i) const { return i == i_; } int i_; }; -bool g(int x, int y) { return x == y; } +TEST_CONSTEXPR bool g(int x, int y) { return x == y; } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef std::forward_list C; @@ -157,5 +157,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } 
diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp index ef6b72ee360a9..cb57b094a077d 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/equal.pass.cpp @@ -10,11 +10,11 @@ // template // bool operator==(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 // // template // bool operator!=(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 #include #include @@ -25,7 +25,7 @@ #include "min_allocator.h" template -void test(int N, int M) { +TEST_CONSTEXPR_CXX26 void test(int N, int M) { C c1; for (int i = 0; i < N; ++i) c1.push_front(i); @@ -44,7 +44,7 @@ void test(int N, int M) { } } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { for (int i = 0; i < 10; ++i) for (int j = 0; j < 10; ++j) test >(i, j); @@ -54,5 +54,14 @@ int main(int, char**) { test> >(i, j); #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp index e50f9e6e9e473..f4f7c6d1f7e53 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/member_swap.pass.cpp @@ -8,7 +8,7 @@ // -// void swap(forward_list& x); +// void swap(forward_list& x); // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -257,5 
+257,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp index cae6950436dee..ce25479781547 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/non_member_swap.pass.cpp @@ -9,7 +9,7 @@ // // template -// void swap(forward_list& x, forward_list& y); +// void swap(forward_list& x, forward_list& y); // constexpr since C++26 #include #include @@ -19,7 +19,7 @@ #include "test_allocator.h" #include "min_allocator.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef int T; typedef test_allocator A; @@ -258,5 +258,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp index d16acadaeb893..7bf80ca026e8e 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/relational.pass.cpp @@ -10,19 +10,19 @@ // template // bool operator< (const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 // // template // bool operator> (const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 // // template // bool operator>=(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 // // 
template // bool operator<=(const forward_list& x, -// const forward_list& y); +// const forward_list& y); // constexpr since C++26 #include #include @@ -33,7 +33,7 @@ #include "min_allocator.h" template -void test(int N, int M) { +TEST_CONSTEXPR_CXX26 void test(int N, int M) { C c1; for (int i = 0; i < N; ++i) c1.push_front(i); @@ -50,7 +50,7 @@ void test(int N, int M) { assert(c1 > c2); } -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { for (int i = 0; i < 10; ++i) for (int j = 0; j < 10; ++j) test >(i, j); @@ -60,5 +60,14 @@ int main(int, char**) { test> >(i, j); #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp index b50e67589471d..02b7b471a1ae8 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/forwardlist.spec/swap_noexcept.compile.pass.cpp @@ -12,10 +12,10 @@ // void swap(forward_list& c) // noexcept(!allocator_type::propagate_on_container_swap::value || -// __is_nothrow_swappable::value); +// __is_nothrow_swappable::value); // constexpr since C++26 // // In C++17, the standard says that swap shall have: -// noexcept(is_always_equal::value); +// noexcept(is_always_equal::value); // constexpr since C++26 // This tests a conforming extension diff --git a/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp index f37f5c2f513bd..624eeb17799c0 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/get_allocator.pass.cpp @@ -10,7 +10,7 @@ // class forward_list -// 
allocator_type get_allocator() const +// allocator_type get_allocator() const // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "test_macros.h" -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { std::allocator alloc; const std::forward_list fl(alloc); @@ -30,5 +30,14 @@ int main(int, char**) { assert(fl.get_allocator() == alloc); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp index b7be03f1062dc..16c6f0b90f96d 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/incomplete.pass.cpp @@ -8,9 +8,9 @@ // -// forward_list() -// forward_list::iterator() -// forward_list::const_iterator() +// forward_list() // constexpr since C++26 +// forward_list::iterator() // constexpr since C++26 +// forward_list::const_iterator() // constexpr since C++26 #include #include @@ -33,7 +33,7 @@ struct B { }; #endif -int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { A a; assert(a.d.empty()); @@ -49,5 +49,14 @@ int main(int, char**) { } #endif + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp b/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp index 5ba0d61f104e0..aab53351f00e2 100644 --- a/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp +++ b/libcxx/test/std/containers/sequences/forwardlist/max_size.pass.cpp @@ -8,7 +8,7 @@ // -// size_type max_size() const; +// size_type max_size() const; // constexpr since C++26 #include #include @@ -18,7 +18,7 @@ #include "test_allocator.h" #include "test_macros.h" 
-int main(int, char**) { +TEST_CONSTEXPR_CXX26 bool test() { { typedef limited_allocator A; typedef std::forward_list C; @@ -42,5 +42,14 @@ int main(int, char**) { assert(c.max_size() <= alloc_max_size(c.get_allocator())); } + return true; +} + +int main(int, char**) { + assert(test()); +#if TEST_STD_VER >= 26 + static_assert(test()); +#endif + return 0; } diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp index 31b3e900aabcd..05f903dccafe7 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/forward_list.version.compile.pass.cpp @@ -24,6 +24,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should not be defined before c++23" # endif @@ -54,6 +58,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should not be defined before c++17" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should not be defined before c++23" # endif @@ -87,6 +95,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++17" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should not be defined before c++23" # endif @@ -126,6 +138,10 @@ # error 
"__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should not be defined before c++23" # endif @@ -171,6 +187,10 @@ # error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++23" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifndef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should be defined in c++23" # endif @@ -219,6 +239,13 @@ # error "__cpp_lib_allocator_traits_is_always_equal should have the value 201411L in c++26" # endif +# ifndef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should be defined in c++26" +# endif +# if __cpp_lib_constexpr_forward_list != 202502L +# error "__cpp_lib_constexpr_forward_list should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_containers_ranges # error "__cpp_lib_containers_ranges should be defined in c++26" # endif diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index b1cc4afd30696..a13edacd1e46a 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -196,6 +196,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_functional # error 
"__cpp_lib_constexpr_functional should not be defined before c++20" # endif @@ -1084,6 +1088,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should not be defined before c++20" # endif @@ -2074,6 +2082,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should not be defined before c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifdef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should not be defined before c++20" # endif @@ -3304,6 +3316,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++20" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifndef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should be defined in c++20" # endif @@ -4756,6 +4772,10 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++23" # endif +# ifdef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should not be defined before c++26" +# endif + # ifndef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should be defined in c++23" # endif @@ -6427,6 +6447,13 @@ # error "__cpp_lib_constexpr_dynamic_alloc should have the value 201907L in c++26" # endif +# ifndef __cpp_lib_constexpr_forward_list +# error "__cpp_lib_constexpr_forward_list should be defined in c++26" +# endif +# if __cpp_lib_constexpr_forward_list != 202502L +# error "__cpp_lib_constexpr_forward_list should have the value 202502L in c++26" +# endif + # ifndef __cpp_lib_constexpr_functional # error "__cpp_lib_constexpr_functional should 
be defined in c++26" # endif diff --git a/libcxx/test/support/counting_predicates.h b/libcxx/test/support/counting_predicates.h index 6f34ce76302a8..8fb2db1af70d3 100644 --- a/libcxx/test/support/counting_predicates.h +++ b/libcxx/test/support/counting_predicates.h @@ -16,42 +16,44 @@ template struct unary_counting_predicate { public: - typedef Arg argument_type; - typedef bool result_type; + typedef Arg argument_type; + typedef bool result_type; - unary_counting_predicate(Predicate p) : p_(p), count_(0) {} - unary_counting_predicate(const unary_counting_predicate&) = default; - unary_counting_predicate& operator=(const unary_counting_predicate&) = default; - ~unary_counting_predicate() {} + TEST_CONSTEXPR_CXX20 unary_counting_predicate(Predicate p) : p_(p), count_(0) {} + unary_counting_predicate(const unary_counting_predicate&) = default; + unary_counting_predicate& operator=(const unary_counting_predicate&) = default; + TEST_CONSTEXPR_CXX20 ~unary_counting_predicate() {} - bool operator () (const Arg &a) const { ++count_; return p_(a); } - std::size_t count() const { return count_; } - void reset() { count_ = 0; } + TEST_CONSTEXPR_CXX14 bool operator()(const Arg& a) const { + ++count_; + return p_(a); + } + TEST_CONSTEXPR std::size_t count() const { return count_; } + TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; } private: - Predicate p_; - mutable std::size_t count_; + Predicate p_; + mutable std::size_t count_; }; - -template +template struct binary_counting_predicate { public: - typedef Arg1 first_argument_type; - typedef Arg2 second_argument_type; - typedef bool result_type; - - TEST_CONSTEXPR binary_counting_predicate(Predicate p) : p_(p), count_(0) {} - TEST_CONSTEXPR_CXX14 bool operator()(const Arg1& a1, const Arg2& a2) const { - ++count_; - return p_(a1, a2); - } - TEST_CONSTEXPR std::size_t count() const { return count_; } - TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; } - - private: - Predicate p_; - mutable std::size_t count_; + typedef Arg1 
first_argument_type; + typedef Arg2 second_argument_type; + typedef bool result_type; + + TEST_CONSTEXPR binary_counting_predicate(Predicate p) : p_(p), count_(0) {} + TEST_CONSTEXPR_CXX14 bool operator()(const Arg1& a1, const Arg2& a2) const { + ++count_; + return p_(a1, a2); + } + TEST_CONSTEXPR std::size_t count() const { return count_; } + TEST_CONSTEXPR_CXX14 void reset() { count_ = 0; } + +private: + Predicate p_; + mutable std::size_t count_; }; #if TEST_STD_VER > 14 @@ -66,13 +68,13 @@ class counting_predicate { constexpr counting_predicate(Predicate pred, int& count) : pred_(std::move(pred)), count_(&count) {} template - constexpr decltype(auto) operator()(Args&& ...args) { + constexpr decltype(auto) operator()(Args&&... args) { ++(*count_); return pred_(std::forward(args)...); } template - constexpr decltype(auto) operator()(Args&& ...args) const { + constexpr decltype(auto) operator()(Args&&... args) const { ++(*count_); return pred_(std::forward(args)...); } diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py old mode 100755 new mode 100644 index 82f0d09db5c36..b59c7fdaf0a3d --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -357,6 +357,11 @@ def add_version_header(tc): "values": {"c++20": 201907}, "headers": ["memory"], }, + { + "name": "__cpp_lib_constexpr_forward_list", + "values": {"c++26": 202502}, + "headers": ["forward_list"], + }, { "name": "__cpp_lib_constexpr_functional", "values": {"c++20": 201907}, From 5188bea9afac859fa6523e07d98748527c295aaf Mon Sep 17 00:00:00 2001 From: Andrew Rogers Date: Wed, 11 Jun 2025 09:18:55 -0700 Subject: [PATCH 098/851] [llvm] annotate interfaces in llvm/TargetParser for DLL export (#143616) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Purpose This patch is one in a series of code-mods that annotate LLVM’s public 
interface for export. This patch annotates the `llvm/TargetParser` library. These annotations currently have no meaningful impact on the LLVM build; however, they are a prerequisite to support an LLVM Windows DLL (shared library) build. ## Background This effort is tracked in #109483. Additional context is provided in [this discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307), and documentation for `LLVM_ABI` and related annotations is found in the LLVM repo [here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst). Most of these changes were generated automatically using the [Interface Definition Scanner (IDS)](https://github.com/compnerd/ids) tool, followed by formatting with `git clang-format`. Additionally, I manually removed the redundant declaration of `getCanonicalArchName` from llvm/include/llvm/TargetParser/ARMTargetParser.h because IDS only auto-annotates the first declaration it encounters, and the second un-annotated declaration results in an MSVC warning. ## Validation Local builds and tests to validate cross-platform compatibility. 
This included llvm, clang, and lldb on the following configurations: - Windows with MSVC - Windows with Clang - Linux with GCC - Linux with Clang - Darwin with Clang --- .../llvm/TargetParser/AArch64TargetParser.h | 60 +++++---- .../llvm/TargetParser/ARMTargetParser.h | 75 +++++------ .../llvm/TargetParser/ARMTargetParserCommon.h | 13 +- .../llvm/TargetParser/CSKYTargetParser.h | 30 +++-- llvm/include/llvm/TargetParser/Host.h | 25 ++-- .../llvm/TargetParser/LoongArchTargetParser.h | 13 +- .../llvm/TargetParser/PPCTargetParser.h | 15 ++- llvm/include/llvm/TargetParser/RISCVISAInfo.h | 42 +++--- .../llvm/TargetParser/RISCVTargetParser.h | 42 +++--- .../llvm/TargetParser/SubtargetFeature.h | 17 +-- llvm/include/llvm/TargetParser/TargetParser.h | 29 +++-- llvm/include/llvm/TargetParser/Triple.h | 121 +++++++++--------- .../llvm/TargetParser/X86TargetParser.h | 35 ++--- 13 files changed, 274 insertions(+), 243 deletions(-) diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index 0338770593bc4..59e8117ccb730 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -19,6 +19,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/SubtargetFeature.h" @@ -79,7 +80,7 @@ struct FMVInfo { : Name(Name), FeatureBit(FeatureBit), PriorityBit(PriorityBit), ID(ID) {}; }; -const std::vector &getFMVInfo(); +LLVM_ABI const std::vector &getFMVInfo(); // Represents a dependency between two architecture extensions. 
Later is the // feature which was added to the architecture after Earlier, and expands the @@ -146,7 +147,7 @@ struct ArchInfo { StringRef getSubArch() const { return ArchFeature.substr(1); } // Search for ArchInfo by SubArch name - static std::optional findBySubArch(StringRef SubArch); + LLVM_ABI static std::optional findBySubArch(StringRef SubArch); }; #define EMIT_ARCHITECTURES @@ -182,34 +183,36 @@ struct ExtensionSet { // Enable the given architecture extension, and any other extensions it // depends on. Does not change the base architecture, or follow dependencies // between features which are only related by required arcitecture versions. - void enable(ArchExtKind E); + LLVM_ABI void enable(ArchExtKind E); // Disable the given architecture extension, and any other extensions which // depend on it. Does not change the base architecture, or follow // dependencies between features which are only related by required // arcitecture versions. - void disable(ArchExtKind E); + LLVM_ABI void disable(ArchExtKind E); // Add default extensions for the given CPU. Records the base architecture, // to later resolve dependencies which depend on it. - void addCPUDefaults(const CpuInfo &CPU); + LLVM_ABI void addCPUDefaults(const CpuInfo &CPU); // Add default extensions for the given architecture version. Records the // base architecture, to later resolve dependencies which depend on it. - void addArchDefaults(const ArchInfo &Arch); + LLVM_ABI void addArchDefaults(const ArchInfo &Arch); // Add or remove a feature based on a modifier string. The string must be of // the form "" to enable a feature or "no" to disable it. This // will also enable or disable any features as required by the dependencies // between them. 
- bool parseModifier(StringRef Modifier, const bool AllowNoDashForm = false); + LLVM_ABI bool parseModifier(StringRef Modifier, + const bool AllowNoDashForm = false); // Constructs a new ExtensionSet by toggling the corresponding bits for every // feature in the \p Features list without expanding their dependencies. Used // for reconstructing an ExtensionSet from the output of toLLVMFeatures(). // Features that are not recognized are pushed back to \p NonExtensions. - void reconstructFromParsedFeatures(const std::vector &Features, - std::vector &NonExtensions); + LLVM_ABI void + reconstructFromParsedFeatures(const std::vector &Features, + std::vector &NonExtensions); // Convert the set of enabled extension to an LLVM feature list, appending // them to Features. @@ -227,7 +230,7 @@ struct ExtensionSet { } } - void dump() const; + LLVM_ABI void dump() const; }; // Name alias. @@ -239,52 +242,53 @@ struct Alias { #define EMIT_CPU_ALIAS #include "llvm/TargetParser/AArch64TargetParserDef.inc" -const ExtensionInfo &getExtensionByID(ArchExtKind(ExtID)); +LLVM_ABI const ExtensionInfo &getExtensionByID(ArchExtKind(ExtID)); -bool getExtensionFeatures( - const AArch64::ExtensionBitset &Extensions, - std::vector &Features); +LLVM_ABI bool getExtensionFeatures(const AArch64::ExtensionBitset &Extensions, + std::vector &Features); -StringRef getArchExtFeature(StringRef ArchExt); -StringRef resolveCPUAlias(StringRef CPU); +LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt); +LLVM_ABI StringRef resolveCPUAlias(StringRef CPU); // Information by Name -const ArchInfo *getArchForCpu(StringRef CPU); +LLVM_ABI const ArchInfo *getArchForCpu(StringRef CPU); // Parser -const ArchInfo *parseArch(StringRef Arch); +LLVM_ABI const ArchInfo *parseArch(StringRef Arch); // Return the extension which has the given -target-feature name. 
-std::optional targetFeatureToExtension(StringRef TargetFeature); +LLVM_ABI std::optional +targetFeatureToExtension(StringRef TargetFeature); // Parse a name as defined by the Extension class in tablegen. -std::optional parseArchExtension(StringRef Extension); +LLVM_ABI std::optional parseArchExtension(StringRef Extension); // Parse a name as defined by the FMVInfo class in tablegen. -std::optional parseFMVExtension(StringRef Extension); +LLVM_ABI std::optional parseFMVExtension(StringRef Extension); // Given the name of a CPU or alias, return the correponding CpuInfo. -std::optional parseCpu(StringRef Name); +LLVM_ABI std::optional parseCpu(StringRef Name); // Used by target parser tests -void fillValidCPUArchList(SmallVectorImpl &Values); +LLVM_ABI void fillValidCPUArchList(SmallVectorImpl &Values); -bool isX18ReservedByDefault(const Triple &TT); +LLVM_ABI bool isX18ReservedByDefault(const Triple &TT); // For a given set of feature names, which can be either target-features, or // fmv-features metadata, expand their dependencies and then return a bitmask // corresponding to the entries of AArch64::FeatPriorities. -uint64_t getFMVPriority(ArrayRef Features); +LLVM_ABI uint64_t getFMVPriority(ArrayRef Features); // For a given set of FMV feature names, expand their dependencies and then // return a bitmask corresponding to the entries of AArch64::CPUFeatures. // The values in CPUFeatures are not bitmasks themselves, they are sequential // (0, 1, 2, 3, ...). The resulting bitmask is used at runtime to test whether // a certain FMV feature is available on the host. 
-uint64_t getCpuSupportsMask(ArrayRef Features); +LLVM_ABI uint64_t getCpuSupportsMask(ArrayRef Features); -void PrintSupportedExtensions(); +LLVM_ABI void PrintSupportedExtensions(); -void printEnabledExtensions(const std::set &EnabledFeatureNames); +LLVM_ABI void +printEnabledExtensions(const std::set &EnabledFeatureNames); } // namespace AArch64 } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.h b/llvm/include/llvm/TargetParser/ARMTargetParser.h index b2403f42f1b79..798c578ced938 100644 --- a/llvm/include/llvm/TargetParser/ARMTargetParser.h +++ b/llvm/include/llvm/TargetParser/ARMTargetParser.h @@ -17,6 +17,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/ARMBuildAttributes.h" +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/ARMTargetParserCommon.h" #include @@ -223,53 +224,55 @@ inline ArchKind &operator--(ArchKind &Kind) { } // Information by ID -StringRef getFPUName(FPUKind FPUKind); -FPUVersion getFPUVersion(FPUKind FPUKind); -NeonSupportLevel getFPUNeonSupportLevel(FPUKind FPUKind); -FPURestriction getFPURestriction(FPUKind FPUKind); - -bool getFPUFeatures(FPUKind FPUKind, std::vector &Features); -bool getHWDivFeatures(uint64_t HWDivKind, std::vector &Features); -bool getExtensionFeatures(uint64_t Extensions, - std::vector &Features); - -StringRef getArchName(ArchKind AK); -unsigned getArchAttr(ArchKind AK); -StringRef getCPUAttr(ArchKind AK); -StringRef getSubArch(ArchKind AK); -StringRef getArchExtName(uint64_t ArchExtKind); -StringRef getArchExtFeature(StringRef ArchExt); -bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, StringRef ArchExt, - std::vector &Features, - FPUKind &ArgFPUKind); -ArchKind convertV9toV8(ArchKind AK); +LLVM_ABI StringRef getFPUName(FPUKind FPUKind); +LLVM_ABI FPUVersion getFPUVersion(FPUKind FPUKind); +LLVM_ABI NeonSupportLevel getFPUNeonSupportLevel(FPUKind FPUKind); +LLVM_ABI FPURestriction getFPURestriction(FPUKind FPUKind); + 
+LLVM_ABI bool getFPUFeatures(FPUKind FPUKind, std::vector &Features); +LLVM_ABI bool getHWDivFeatures(uint64_t HWDivKind, + std::vector &Features); +LLVM_ABI bool getExtensionFeatures(uint64_t Extensions, + std::vector &Features); + +LLVM_ABI StringRef getArchName(ArchKind AK); +LLVM_ABI unsigned getArchAttr(ArchKind AK); +LLVM_ABI StringRef getCPUAttr(ArchKind AK); +LLVM_ABI StringRef getSubArch(ArchKind AK); +LLVM_ABI StringRef getArchExtName(uint64_t ArchExtKind); +LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt); +LLVM_ABI bool appendArchExtFeatures(StringRef CPU, ARM::ArchKind AK, + StringRef ArchExt, + std::vector &Features, + FPUKind &ArgFPUKind); +LLVM_ABI ArchKind convertV9toV8(ArchKind AK); // Information by Name -FPUKind getDefaultFPU(StringRef CPU, ArchKind AK); -uint64_t getDefaultExtensions(StringRef CPU, ArchKind AK); -StringRef getDefaultCPU(StringRef Arch); -StringRef getCanonicalArchName(StringRef Arch); -StringRef getFPUSynonym(StringRef FPU); +LLVM_ABI FPUKind getDefaultFPU(StringRef CPU, ArchKind AK); +LLVM_ABI uint64_t getDefaultExtensions(StringRef CPU, ArchKind AK); +LLVM_ABI StringRef getDefaultCPU(StringRef Arch); +LLVM_ABI StringRef getFPUSynonym(StringRef FPU); // Parser -uint64_t parseHWDiv(StringRef HWDiv); -FPUKind parseFPU(StringRef FPU); -ArchKind parseArch(StringRef Arch); -uint64_t parseArchExt(StringRef ArchExt); -ArchKind parseCPUArch(StringRef CPU); -ProfileKind parseArchProfile(StringRef Arch); -unsigned parseArchVersion(StringRef Arch); +LLVM_ABI uint64_t parseHWDiv(StringRef HWDiv); +LLVM_ABI FPUKind parseFPU(StringRef FPU); +LLVM_ABI ArchKind parseArch(StringRef Arch); +LLVM_ABI uint64_t parseArchExt(StringRef ArchExt); +LLVM_ABI ArchKind parseCPUArch(StringRef CPU); +LLVM_ABI ProfileKind parseArchProfile(StringRef Arch); +LLVM_ABI unsigned parseArchVersion(StringRef Arch); -void fillValidCPUArchList(SmallVectorImpl &Values); -StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU); +LLVM_ABI void 
fillValidCPUArchList(SmallVectorImpl &Values); +LLVM_ABI StringRef computeDefaultTargetABI(const Triple &TT, StringRef CPU); /// Get the (LLVM) name of the minimum ARM CPU for the arch we are targeting. /// /// \param Arch the architecture name (e.g., "armv7s"). If it is an empty /// string then the triple's arch name is used. -StringRef getARMCPUForArch(const llvm::Triple &Triple, StringRef MArch = {}); +LLVM_ABI StringRef getARMCPUForArch(const llvm::Triple &Triple, + StringRef MArch = {}); -void PrintSupportedExtensions(StringMap DescMap); +LLVM_ABI void PrintSupportedExtensions(StringMap DescMap); } // namespace ARM } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h index f6115718e9f5f..7c8030dd5576a 100644 --- a/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h +++ b/llvm/include/llvm/TargetParser/ARMTargetParserCommon.h @@ -14,6 +14,7 @@ #define LLVM_TARGETPARSER_ARMTARGETPARSERCOMMON_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" namespace llvm { namespace ARM { @@ -23,19 +24,19 @@ enum class ISAKind { INVALID = 0, ARM, THUMB, AARCH64 }; enum class EndianKind { INVALID = 0, LITTLE, BIG }; /// Converts e.g. "armv8" -> "armv8-a" -StringRef getArchSynonym(StringRef Arch); +LLVM_ABI StringRef getArchSynonym(StringRef Arch); /// MArch is expected to be of the form (arm|thumb)?(eb)?(v.+)?(eb)?, but /// (iwmmxt|xscale)(eb)? is also permitted. If the former, return /// "v.+", if the latter, return unmodified string, minus 'eb'. /// If invalid, return empty string. 
-StringRef getCanonicalArchName(StringRef Arch); +LLVM_ABI StringRef getCanonicalArchName(StringRef Arch); // ARM, Thumb, AArch64 -ISAKind parseArchISA(StringRef Arch); +LLVM_ABI ISAKind parseArchISA(StringRef Arch); // Little/Big endian -EndianKind parseArchEndian(StringRef Arch); +LLVM_ABI EndianKind parseArchEndian(StringRef Arch); struct ParsedBranchProtection { StringRef Scope; @@ -45,8 +46,8 @@ struct ParsedBranchProtection { bool GuardedControlStack; }; -bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, - StringRef &Err, bool EnablePAuthLR = false); +LLVM_ABI bool parseBranchProtection(StringRef Spec, ParsedBranchProtection &PBP, + StringRef &Err, bool EnablePAuthLR = false); } // namespace ARM } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/CSKYTargetParser.h b/llvm/include/llvm/TargetParser/CSKYTargetParser.h index 4c4ec06f758a8..8eab03ca01490 100644 --- a/llvm/include/llvm/TargetParser/CSKYTargetParser.h +++ b/llvm/include/llvm/TargetParser/CSKYTargetParser.h @@ -15,6 +15,7 @@ #ifndef LLVM_TARGETPARSER_CSKYTARGETPARSER_H #define LLVM_TARGETPARSER_CSKYTARGETPARSER_H +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" #include @@ -176,25 +177,26 @@ const ArchNames ARCHNames[] = { #include "llvm/TargetParser/CSKYTargetParser.def" }; -StringRef getArchName(ArchKind AK); -StringRef getDefaultCPU(StringRef Arch); -StringRef getArchExtName(uint64_t ArchExtKind); -StringRef getArchExtFeature(StringRef ArchExt); -uint64_t getDefaultExtensions(StringRef CPU); -bool getExtensionFeatures(uint64_t Extensions, - std::vector &Features); +LLVM_ABI StringRef getArchName(ArchKind AK); +LLVM_ABI StringRef getDefaultCPU(StringRef Arch); +LLVM_ABI StringRef getArchExtName(uint64_t ArchExtKind); +LLVM_ABI StringRef getArchExtFeature(StringRef ArchExt); +LLVM_ABI uint64_t getDefaultExtensions(StringRef CPU); +LLVM_ABI bool getExtensionFeatures(uint64_t Extensions, + std::vector &Features); // Information by ID 
-StringRef getFPUName(unsigned FPUKind); -FPUVersion getFPUVersion(unsigned FPUKind); +LLVM_ABI StringRef getFPUName(unsigned FPUKind); +LLVM_ABI FPUVersion getFPUVersion(unsigned FPUKind); -bool getFPUFeatures(CSKYFPUKind Kind, std::vector &Features); +LLVM_ABI bool getFPUFeatures(CSKYFPUKind Kind, + std::vector &Features); // Parser -ArchKind parseArch(StringRef Arch); -ArchKind parseCPUArch(StringRef CPU); -uint64_t parseArchExt(StringRef ArchExt); -void fillValidCPUArchList(SmallVectorImpl &Values); +LLVM_ABI ArchKind parseArch(StringRef Arch); +LLVM_ABI ArchKind parseCPUArch(StringRef CPU); +LLVM_ABI uint64_t parseArchExt(StringRef ArchExt); +LLVM_ABI void fillValidCPUArchList(SmallVectorImpl &Values); } // namespace CSKY diff --git a/llvm/include/llvm/TargetParser/Host.h b/llvm/include/llvm/TargetParser/Host.h index 443f4f583b559..be3d41e022ad9 100644 --- a/llvm/include/llvm/TargetParser/Host.h +++ b/llvm/include/llvm/TargetParser/Host.h @@ -13,6 +13,7 @@ #ifndef LLVM_TARGETPARSER_HOST_H #define LLVM_TARGETPARSER_HOST_H +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -30,18 +31,18 @@ namespace sys { /// CPU_TYPE-VENDOR-OPERATING_SYSTEM /// or /// CPU_TYPE-VENDOR-KERNEL-OPERATING_SYSTEM -std::string getDefaultTargetTriple(); +LLVM_ABI std::string getDefaultTargetTriple(); /// getProcessTriple() - Return an appropriate target triple for generating /// code to be loaded into the current process, e.g. when using the JIT. -std::string getProcessTriple(); +LLVM_ABI std::string getProcessTriple(); /// getHostCPUName - Get the LLVM name for the host CPU. The particular format /// of the name is target dependent, and suitable for passing as -mcpu to the /// target which matches the host. /// /// \return - The host CPU name, or empty if the CPU could not be determined. -StringRef getHostCPUName(); +LLVM_ABI StringRef getHostCPUName(); /// getHostCPUFeatures - Get the LLVM names for the host CPU features. 
/// The particular format of the names are target dependent, and suitable for @@ -52,20 +53,20 @@ StringRef getHostCPUName(); /// which features may appear in this map, except that they are all valid LLVM /// feature names. The map can be empty, for example if feature detection /// fails. -const StringMap getHostCPUFeatures(); +LLVM_ABI const StringMap getHostCPUFeatures(); /// This is a function compatible with cl::AddExtraVersionPrinter, which adds /// info about the current target triple and detected CPU. -void printDefaultTargetAndDetectedCPU(raw_ostream &OS); +LLVM_ABI void printDefaultTargetAndDetectedCPU(raw_ostream &OS); namespace detail { /// Helper functions to extract HostCPUName from /proc/cpuinfo on linux. -StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent); -StringRef getHostCPUNameForBPF(); +LLVM_ABI StringRef getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForARM(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForS390x(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForRISCV(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForSPARC(StringRef ProcCpuinfoContent); +LLVM_ABI StringRef getHostCPUNameForBPF(); /// Helper functions to extract CPU details from CPUID on x86. namespace x86 { @@ -78,7 +79,7 @@ enum class VendorSignatures { /// Returns the host CPU's vendor. /// MaxLeaf: if a non-nullptr pointer is specified, the EAX value will be /// assigned to its pointee. 
-VendorSignatures getVendorSignature(unsigned *MaxLeaf = nullptr); +LLVM_ABI VendorSignatures getVendorSignature(unsigned *MaxLeaf = nullptr); } // namespace x86 } // namespace detail } // namespace sys diff --git a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h index a28e4e9eff811..1357d74744592 100644 --- a/llvm/include/llvm/TargetParser/LoongArchTargetParser.h +++ b/llvm/include/llvm/TargetParser/LoongArchTargetParser.h @@ -14,6 +14,7 @@ #ifndef LLVM_TARGETPARSER_LOONGARCHTARGETPARSER_H #define LLVM_TARGETPARSER_LOONGARCHTARGETPARSER_H +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" #include @@ -84,12 +85,12 @@ struct ArchInfo { uint32_t Features; }; -bool isValidArchName(StringRef Arch); -bool isValidFeatureName(StringRef Feature); -bool getArchFeatures(StringRef Arch, std::vector &Features); -bool isValidCPUName(StringRef TuneCPU); -void fillValidCPUList(SmallVectorImpl &Values); -StringRef getDefaultArch(bool Is64Bit); +LLVM_ABI bool isValidArchName(StringRef Arch); +LLVM_ABI bool isValidFeatureName(StringRef Feature); +LLVM_ABI bool getArchFeatures(StringRef Arch, std::vector &Features); +LLVM_ABI bool isValidCPUName(StringRef TuneCPU); +LLVM_ABI void fillValidCPUList(SmallVectorImpl &Values); +LLVM_ABI StringRef getDefaultArch(bool Is64Bit); } // namespace LoongArch diff --git a/llvm/include/llvm/TargetParser/PPCTargetParser.h b/llvm/include/llvm/TargetParser/PPCTargetParser.h index 5f9fe543aff0b..59d9f867005a4 100644 --- a/llvm/include/llvm/TargetParser/PPCTargetParser.h +++ b/llvm/include/llvm/TargetParser/PPCTargetParser.h @@ -15,25 +15,28 @@ #define LLVM_TARGETPARSER_PPCTARGETPARSER_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" namespace llvm { namespace PPC { -bool isValidCPU(StringRef CPU); -void fillValidCPUList(SmallVectorImpl &Values); -void fillValidTuneCPUList(SmallVectorImpl &Values); +LLVM_ABI 
bool isValidCPU(StringRef CPU); +LLVM_ABI void fillValidCPUList(SmallVectorImpl &Values); +LLVM_ABI void fillValidTuneCPUList(SmallVectorImpl &Values); // Get target CPU name. // If CPUName is empty or generic, return the default CPU name. // If CPUName is not empty or generic, return the normalized CPU name. -StringRef getNormalizedPPCTargetCPU(const Triple &T, StringRef CPUName = ""); +LLVM_ABI StringRef getNormalizedPPCTargetCPU(const Triple &T, + StringRef CPUName = ""); // Get the tune CPU name. -StringRef getNormalizedPPCTuneCPU(const Triple &T, StringRef CPUName = ""); +LLVM_ABI StringRef getNormalizedPPCTuneCPU(const Triple &T, + StringRef CPUName = ""); // For PPC, there are some cpu names for same CPU, like pwr10 and power10, // normalize them. -StringRef normalizeCPUName(StringRef CPUName); +LLVM_ABI StringRef normalizeCPUName(StringRef CPUName); } // namespace PPC } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/RISCVISAInfo.h b/llvm/include/llvm/TargetParser/RISCVISAInfo.h index 5b2b6f29fd3db..0c308cadba790 100644 --- a/llvm/include/llvm/TargetParser/RISCVISAInfo.h +++ b/llvm/include/llvm/TargetParser/RISCVISAInfo.h @@ -11,6 +11,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/RISCVISAUtils.h" @@ -31,27 +32,27 @@ class RISCVISAInfo { /// extensions with unrecognised versions will be silently dropped, except /// for the special case of the base 'i' and 'e' extensions, where the /// default version will be used (as ignoring the base is not possible). - static llvm::Expected> + LLVM_ABI static llvm::Expected> parseArchString(StringRef Arch, bool EnableExperimentalExtension, bool ExperimentalExtensionVersionCheck = true); /// Parse RISC-V ISA info from an arch string that is already in normalized /// form (as defined in the psABI). 
Unlike parseArchString, this function /// will not error for unrecognized extension names or extension versions. - static llvm::Expected> + LLVM_ABI static llvm::Expected> parseNormalizedArchString(StringRef Arch); /// Parse RISC-V ISA info from feature vector. - static llvm::Expected> + LLVM_ABI static llvm::Expected> parseFeatures(unsigned XLen, const std::vector &Features); - static llvm::Expected> + LLVM_ABI static llvm::Expected> createFromExtMap(unsigned XLen, const RISCVISAUtils::OrderedExtensionMap &Exts); /// Convert RISC-V ISA info to a feature vector. - std::vector toFeatures(bool AddAllExtensions = false, - bool IgnoreUnknown = true) const; + LLVM_ABI std::vector toFeatures(bool AddAllExtensions = false, + bool IgnoreUnknown = true) const; const RISCVISAUtils::OrderedExtensionMap &getExtensions() const { return Exts; @@ -64,25 +65,26 @@ class RISCVISAInfo { unsigned getMaxELen() const { return MaxELen; } unsigned getMaxELenFp() const { return MaxELenFp; } - bool hasExtension(StringRef Ext) const; - std::string toString() const; - StringRef computeDefaultABI() const; + LLVM_ABI bool hasExtension(StringRef Ext) const; + LLVM_ABI std::string toString() const; + LLVM_ABI StringRef computeDefaultABI() const; - static bool isSupportedExtensionFeature(StringRef Ext); - static bool isSupportedExtension(StringRef Ext); - static bool isSupportedExtensionWithVersion(StringRef Ext); - static bool isSupportedExtension(StringRef Ext, unsigned MajorVersion, - unsigned MinorVersion); - static std::string getTargetFeatureForExtension(StringRef Ext); + LLVM_ABI static bool isSupportedExtensionFeature(StringRef Ext); + LLVM_ABI static bool isSupportedExtension(StringRef Ext); + LLVM_ABI static bool isSupportedExtensionWithVersion(StringRef Ext); + LLVM_ABI static bool isSupportedExtension(StringRef Ext, + unsigned MajorVersion, + unsigned MinorVersion); + LLVM_ABI static std::string getTargetFeatureForExtension(StringRef Ext); - static void 
printSupportedExtensions(StringMap &DescMap); - static void printEnabledExtensions(bool IsRV64, - std::set &EnabledFeatureNames, - StringMap &DescMap); + LLVM_ABI static void printSupportedExtensions(StringMap &DescMap); + LLVM_ABI static void + printEnabledExtensions(bool IsRV64, std::set &EnabledFeatureNames, + StringMap &DescMap); /// Return the group id and bit position of __riscv_feature_bits. Returns /// <-1, -1> if not supported. - static std::pair getRISCVFeaturesBitsInfo(StringRef Ext); + LLVM_ABI static std::pair getRISCVFeaturesBitsInfo(StringRef Ext); // The maximum value of the group ID obtained from getRISCVFeaturesBitsInfo. static constexpr unsigned FeatureBitSize = 2; diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h index a529479b546d9..41fdab6012aa0 100644 --- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h +++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h @@ -15,6 +15,7 @@ #define LLVM_TARGETPARSER_RISCVTARGETPARSER_H #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" @@ -43,18 +44,20 @@ struct CPUInfo { static constexpr unsigned RVVBitsPerBlock = 64; static constexpr unsigned RVVBytesPerBlock = RVVBitsPerBlock / 8; -void getFeaturesForCPU(StringRef CPU, - SmallVectorImpl &EnabledFeatures, - bool NeedPlus = false); -bool parseCPU(StringRef CPU, bool IsRV64); -bool parseTuneCPU(StringRef CPU, bool IsRV64); -StringRef getMArchFromMcpu(StringRef CPU); -void fillValidCPUArchList(SmallVectorImpl &Values, bool IsRV64); -void fillValidTuneCPUArchList(SmallVectorImpl &Values, bool IsRV64); -bool hasFastScalarUnalignedAccess(StringRef CPU); -bool hasFastVectorUnalignedAccess(StringRef CPU); -bool hasValidCPUModel(StringRef CPU); -CPUModel getCPUModel(StringRef CPU); +LLVM_ABI void getFeaturesForCPU(StringRef CPU, + SmallVectorImpl &EnabledFeatures, + bool NeedPlus = false); +LLVM_ABI 
bool parseCPU(StringRef CPU, bool IsRV64); +LLVM_ABI bool parseTuneCPU(StringRef CPU, bool IsRV64); +LLVM_ABI StringRef getMArchFromMcpu(StringRef CPU); +LLVM_ABI void fillValidCPUArchList(SmallVectorImpl &Values, + bool IsRV64); +LLVM_ABI void fillValidTuneCPUArchList(SmallVectorImpl &Values, + bool IsRV64); +LLVM_ABI bool hasFastScalarUnalignedAccess(StringRef CPU); +LLVM_ABI bool hasFastVectorUnalignedAccess(StringRef CPU); +LLVM_ABI bool hasValidCPUModel(StringRef CPU); +LLVM_ABI CPUModel getCPUModel(StringRef CPU); } // namespace RISCV @@ -86,10 +89,10 @@ inline static bool isValidLMUL(unsigned LMUL, bool Fractional) { return isPowerOf2_32(LMUL) && LMUL <= 8 && (!Fractional || LMUL != 1); } -unsigned encodeVTYPE(VLMUL VLMUL, unsigned SEW, bool TailAgnostic, - bool MaskAgnostic); +LLVM_ABI unsigned encodeVTYPE(VLMUL VLMUL, unsigned SEW, bool TailAgnostic, + bool MaskAgnostic); -unsigned encodeXSfmmVType(unsigned SEW, unsigned Widen, bool AltFmt); +LLVM_ABI unsigned encodeXSfmmVType(unsigned SEW, unsigned Widen, bool AltFmt); inline static VLMUL getVLMUL(unsigned VType) { unsigned VLMul = VType & 0x7; @@ -97,7 +100,7 @@ inline static VLMUL getVLMUL(unsigned VType) { } // Decode VLMUL into 1,2,4,8 and fractional indicator. 
-std::pair decodeVLMUL(VLMUL VLMul); +LLVM_ABI std::pair decodeVLMUL(VLMUL VLMul); inline static VLMUL encodeLMUL(unsigned LMUL, bool Fractional) { assert(isValidLMUL(LMUL, Fractional) && "Unsupported LMUL"); @@ -148,11 +151,12 @@ inline static bool isMaskAgnostic(unsigned VType) { return VType & 0x80; } inline static bool isAltFmt(unsigned VType) { return VType & 0x100; } -void printVType(unsigned VType, raw_ostream &OS); +LLVM_ABI void printVType(unsigned VType, raw_ostream &OS); -unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul); +LLVM_ABI unsigned getSEWLMULRatio(unsigned SEW, VLMUL VLMul); -std::optional getSameRatioLMUL(unsigned SEW, VLMUL VLMUL, unsigned EEW); +LLVM_ABI std::optional getSameRatioLMUL(unsigned SEW, VLMUL VLMUL, + unsigned EEW); } // namespace RISCVVType } // namespace llvm diff --git a/llvm/include/llvm/TargetParser/SubtargetFeature.h b/llvm/include/llvm/TargetParser/SubtargetFeature.h index 2e1f00dad2df3..6f1723dec5d04 100644 --- a/llvm/include/llvm/TargetParser/SubtargetFeature.h +++ b/llvm/include/llvm/TargetParser/SubtargetFeature.h @@ -20,6 +20,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MathExtras.h" #include #include @@ -175,27 +176,27 @@ class SubtargetFeatures { std::vector Features; ///< Subtarget features as a vector public: - explicit SubtargetFeatures(StringRef Initial = ""); + LLVM_ABI explicit SubtargetFeatures(StringRef Initial = ""); /// Returns features as a string. - std::string getString() const; + LLVM_ABI std::string getString() const; /// Adds Features. - void AddFeature(StringRef String, bool Enable = true); + LLVM_ABI void AddFeature(StringRef String, bool Enable = true); - void addFeaturesVector(const ArrayRef OtherFeatures); + LLVM_ABI void addFeaturesVector(const ArrayRef OtherFeatures); /// Returns the vector of individual subtarget features. 
const std::vector &getFeatures() const { return Features; } /// Prints feature string. - void print(raw_ostream &OS) const; + LLVM_ABI void print(raw_ostream &OS) const; // Dumps feature info. - void dump() const; + LLVM_ABI void dump() const; /// Adds the default features for the specified target triple. - void getDefaultSubtargetFeatures(const Triple& Triple); + LLVM_ABI void getDefaultSubtargetFeatures(const Triple &Triple); /// Determine if a feature has a flag; '+' or '-' static bool hasFlag(StringRef Feature) { @@ -221,7 +222,7 @@ class SubtargetFeatures { } /// Splits a string of comma separated items in to a vector of strings. - static void Split(std::vector &V, StringRef S); + LLVM_ABI static void Split(std::vector &V, StringRef S); }; } // end namespace llvm diff --git a/llvm/include/llvm/TargetParser/TargetParser.h b/llvm/include/llvm/TargetParser/TargetParser.h index f776b41f3d7ca..176205e17ae00 100644 --- a/llvm/include/llvm/TargetParser/TargetParser.h +++ b/llvm/include/llvm/TargetParser/TargetParser.h @@ -16,6 +16,7 @@ #include "llvm/ADT/StringMap.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" namespace llvm { @@ -164,27 +165,27 @@ enum FeatureError : uint32_t { UNSUPPORTED_TARGET_FEATURE }; -StringRef getArchFamilyNameAMDGCN(GPUKind AK); +LLVM_ABI StringRef getArchFamilyNameAMDGCN(GPUKind AK); -StringRef getArchNameAMDGCN(GPUKind AK); -StringRef getArchNameR600(GPUKind AK); -StringRef getCanonicalArchName(const Triple &T, StringRef Arch); -GPUKind parseArchAMDGCN(StringRef CPU); -GPUKind parseArchR600(StringRef CPU); -unsigned getArchAttrAMDGCN(GPUKind AK); -unsigned getArchAttrR600(GPUKind AK); +LLVM_ABI StringRef getArchNameAMDGCN(GPUKind AK); +LLVM_ABI StringRef getArchNameR600(GPUKind AK); +LLVM_ABI StringRef getCanonicalArchName(const Triple &T, StringRef Arch); +LLVM_ABI GPUKind parseArchAMDGCN(StringRef CPU); +LLVM_ABI GPUKind parseArchR600(StringRef CPU); +LLVM_ABI unsigned getArchAttrAMDGCN(GPUKind AK); +LLVM_ABI 
unsigned getArchAttrR600(GPUKind AK); -void fillValidArchListAMDGCN(SmallVectorImpl &Values); -void fillValidArchListR600(SmallVectorImpl &Values); +LLVM_ABI void fillValidArchListAMDGCN(SmallVectorImpl &Values); +LLVM_ABI void fillValidArchListR600(SmallVectorImpl &Values); -IsaVersion getIsaVersion(StringRef GPU); +LLVM_ABI IsaVersion getIsaVersion(StringRef GPU); /// Fills Features map with default values for given target GPU -void fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, - StringMap &Features); +LLVM_ABI void fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, + StringMap &Features); /// Inserts wave size feature for given GPU into features map -std::pair +LLVM_ABI std::pair insertWaveSizeFeature(StringRef GPU, const Triple &T, StringMap &Features); diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index b56e6e18805e0..b6f15ef13191f 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -10,6 +10,7 @@ #define LLVM_TARGETPARSER_TRIPLE_H #include "llvm/ADT/Twine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" // Some system headers or GCC predefined macros conflict with identifiers in @@ -348,10 +349,11 @@ class Triple { /// triple fields unknown. Triple() = default; - explicit Triple(const Twine &Str); - Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr); - Triple(const Twine &ArchStr, const Twine &VendorStr, const Twine &OSStr, - const Twine &EnvironmentStr); + LLVM_ABI explicit Triple(const Twine &Str); + LLVM_ABI Triple(const Twine &ArchStr, const Twine &VendorStr, + const Twine &OSStr); + LLVM_ABI Triple(const Twine &ArchStr, const Twine &VendorStr, + const Twine &OSStr, const Twine &EnvironmentStr); bool operator==(const Triple &Other) const { return Arch == Other.Arch && SubArch == Other.SubArch && @@ -381,8 +383,8 @@ class Triple { /// reasonably be done). 
In particular, it handles the common case in which /// otherwise valid components are in the wrong order. \p Form is used to /// specify the output canonical form. - static std::string normalize(StringRef Str, - CanonicalForm Form = CanonicalForm::ANY); + LLVM_ABI static std::string + normalize(StringRef Str, CanonicalForm Form = CanonicalForm::ANY); /// Return the normalized form of this triple's string. std::string normalize(CanonicalForm Form = CanonicalForm::ANY) const { @@ -417,7 +419,7 @@ class Triple { /// triple, if present. /// /// For example, "fooos1.2.3" would return (1, 2, 3). - VersionTuple getEnvironmentVersion() const; + LLVM_ABI VersionTuple getEnvironmentVersion() const; /// Get the object format for this triple. ObjectFormatType getObjectFormat() const { return ObjectFormat; } @@ -426,7 +428,7 @@ class Triple { /// present. /// /// For example, "fooos1.2.3" would return (1, 2, 3). - VersionTuple getOSVersion() const; + LLVM_ABI VersionTuple getOSVersion() const; /// Return just the major version number, this is specialized because it is a /// common query. @@ -436,26 +438,26 @@ class Triple { /// "darwin" versions to the corresponding OS X versions. This may also be /// called with IOS triples but the OS X version number is just set to a /// constant 10.4.0 in that case. Returns true if successful. - bool getMacOSXVersion(VersionTuple &Version) const; + LLVM_ABI bool getMacOSXVersion(VersionTuple &Version) const; /// Parse the version number as with getOSVersion. This should only be called /// with IOS or generic triples. - VersionTuple getiOSVersion() const; + LLVM_ABI VersionTuple getiOSVersion() const; /// Parse the version number as with getOSVersion. This should only be called /// with WatchOS or generic triples. - VersionTuple getWatchOSVersion() const; + LLVM_ABI VersionTuple getWatchOSVersion() const; /// Parse the version number as with getOSVersion. 
- VersionTuple getDriverKitVersion() const; + LLVM_ABI VersionTuple getDriverKitVersion() const; /// Parse the Vulkan version number from the OSVersion and SPIR-V version /// (SubArch). This should only be called with Vulkan SPIR-V triples. - VersionTuple getVulkanVersion() const; + LLVM_ABI VersionTuple getVulkanVersion() const; /// Parse the DXIL version number from the OSVersion and DXIL version /// (SubArch). This should only be called with DXIL triples. - VersionTuple getDXILVersion() const; + LLVM_ABI VersionTuple getDXILVersion() const; /// @} /// @name Direct Component Access @@ -469,34 +471,34 @@ class Triple { bool empty() const { return Data.empty(); } /// Get the architecture (first) component of the triple. - StringRef getArchName() const; + LLVM_ABI StringRef getArchName() const; /// Get the vendor (second) component of the triple. - StringRef getVendorName() const; + LLVM_ABI StringRef getVendorName() const; /// Get the operating system (third) component of the triple. - StringRef getOSName() const; + LLVM_ABI StringRef getOSName() const; /// Get the optional environment (fourth) component of the triple, or "" if /// empty. - StringRef getEnvironmentName() const; + LLVM_ABI StringRef getEnvironmentName() const; /// Get the operating system and optional environment components as a single /// string (separated by a '-' if the environment component is present). - StringRef getOSAndEnvironmentName() const; + LLVM_ABI StringRef getOSAndEnvironmentName() const; /// Get the version component of the environment component as a single /// string (the version after the environment). /// /// For example, "fooos1.2.3" would return "1.2.3". - StringRef getEnvironmentVersionString() const; + LLVM_ABI StringRef getEnvironmentVersionString() const; /// @} /// @name Convenience Predicates /// @{ /// Returns the pointer width of this architecture. 
- static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch); + LLVM_ABI static unsigned getArchPointerBitWidth(llvm::Triple::ArchType Arch); /// Returns the pointer width of this architecture. unsigned getArchPointerBitWidth() const { @@ -504,7 +506,7 @@ class Triple { } /// Returns the trampoline size in bytes for this configuration. - unsigned getTrampolineSize() const; + LLVM_ABI unsigned getTrampolineSize() const; /// Test whether the architecture is 64-bit /// @@ -513,17 +515,17 @@ class Triple { /// 16-bit. The inner details of pointer width for particular architectures /// is not summed up in the triple, and so only a coarse grained predicate /// system is provided. - bool isArch64Bit() const; + LLVM_ABI bool isArch64Bit() const; /// Test whether the architecture is 32-bit /// /// Note that this tests for 32-bit pointer width, and nothing else. - bool isArch32Bit() const; + LLVM_ABI bool isArch32Bit() const; /// Test whether the architecture is 16-bit /// /// Note that this tests for 16-bit pointer width, and nothing else. - bool isArch16Bit() const; + LLVM_ABI bool isArch16Bit() const; /// Helper function for doing comparisons against version numbers included in /// the target triple. @@ -544,8 +546,8 @@ class Triple { /// Comparison function for checking OS X version compatibility, which handles /// supporting skewed version numbering schemes used by the "darwin" triples. - bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0, - unsigned Micro = 0) const; + LLVM_ABI bool isMacOSXVersionLT(unsigned Major, unsigned Minor = 0, + unsigned Micro = 0) const; /// Is this a Mac OS X triple. For legacy reasons, we support both "darwin" /// and "osx" as OS X triples. @@ -1171,38 +1173,38 @@ class Triple { /// @{ /// Set the architecture (first) component of the triple to a known type. 
- void setArch(ArchType Kind, SubArchType SubArch = NoSubArch); + LLVM_ABI void setArch(ArchType Kind, SubArchType SubArch = NoSubArch); /// Set the vendor (second) component of the triple to a known type. - void setVendor(VendorType Kind); + LLVM_ABI void setVendor(VendorType Kind); /// Set the operating system (third) component of the triple to a known type. - void setOS(OSType Kind); + LLVM_ABI void setOS(OSType Kind); /// Set the environment (fourth) component of the triple to a known type. - void setEnvironment(EnvironmentType Kind); + LLVM_ABI void setEnvironment(EnvironmentType Kind); /// Set the object file format. - void setObjectFormat(ObjectFormatType Kind); + LLVM_ABI void setObjectFormat(ObjectFormatType Kind); /// Set all components to the new triple \p Str. - void setTriple(const Twine &Str); + LLVM_ABI void setTriple(const Twine &Str); /// Set the architecture (first) component of the triple by name. - void setArchName(StringRef Str); + LLVM_ABI void setArchName(StringRef Str); /// Set the vendor (second) component of the triple by name. - void setVendorName(StringRef Str); + LLVM_ABI void setVendorName(StringRef Str); /// Set the operating system (third) component of the triple by name. - void setOSName(StringRef Str); + LLVM_ABI void setOSName(StringRef Str); /// Set the optional environment (fourth) component of the triple by name. - void setEnvironmentName(StringRef Str); + LLVM_ABI void setEnvironmentName(StringRef Str); /// Set the operating system and optional environment components with a single /// string. - void setOSAndEnvironmentName(StringRef Str); + LLVM_ABI void setOSAndEnvironmentName(StringRef Str); /// @} /// @name Helpers to build variants of a particular triple. @@ -1214,7 +1216,7 @@ class Triple { /// /// \returns A new triple with a 32-bit architecture or an unknown /// architecture if no such variant can be found. 
- llvm::Triple get32BitArchVariant() const; + LLVM_ABI llvm::Triple get32BitArchVariant() const; /// Form a triple with a 64-bit variant of the current architecture. /// @@ -1222,7 +1224,7 @@ class Triple { /// /// \returns A new triple with a 64-bit architecture or an unknown /// architecture if no such variant can be found. - llvm::Triple get64BitArchVariant() const; + LLVM_ABI llvm::Triple get64BitArchVariant() const; /// Form a triple with a big endian variant of the current architecture. /// @@ -1230,7 +1232,7 @@ class Triple { /// /// \returns A new triple with a big endian architecture or an unknown /// architecture if no such variant can be found. - llvm::Triple getBigEndianArchVariant() const; + LLVM_ABI llvm::Triple getBigEndianArchVariant() const; /// Form a triple with a little endian variant of the current architecture. /// @@ -1238,73 +1240,76 @@ class Triple { /// /// \returns A new triple with a little endian architecture or an unknown /// architecture if no such variant can be found. - llvm::Triple getLittleEndianArchVariant() const; + LLVM_ABI llvm::Triple getLittleEndianArchVariant() const; /// Tests whether the target triple is little endian. /// /// \returns true if the triple is little endian, false otherwise. - bool isLittleEndian() const; + LLVM_ABI bool isLittleEndian() const; /// Test whether target triples are compatible. - bool isCompatibleWith(const Triple &Other) const; + LLVM_ABI bool isCompatibleWith(const Triple &Other) const; /// Test whether the target triple is for a GPU. bool isGPU() const { return isSPIRV() || isNVPTX() || isAMDGPU(); } /// Merge target triples. - std::string merge(const Triple &Other) const; + LLVM_ABI std::string merge(const Triple &Other) const; /// Some platforms have different minimum supported OS versions that /// varies by the architecture specified in the triple. 
This function /// returns the minimum supported OS version for this triple if one an exists, /// or an invalid version tuple if this triple doesn't have one. - VersionTuple getMinimumSupportedOSVersion() const; + LLVM_ABI VersionTuple getMinimumSupportedOSVersion() const; /// @} /// @name Static helpers for IDs. /// @{ /// Get the canonical name for the \p Kind architecture. - static StringRef getArchTypeName(ArchType Kind); + LLVM_ABI static StringRef getArchTypeName(ArchType Kind); /// Get the architecture name based on \p Kind and \p SubArch. - static StringRef getArchName(ArchType Kind, SubArchType SubArch = NoSubArch); + LLVM_ABI static StringRef getArchName(ArchType Kind, + SubArchType SubArch = NoSubArch); /// Get the "prefix" canonical name for the \p Kind architecture. This is the /// prefix used by the architecture specific builtins, and is suitable for /// passing to \see Intrinsic::getIntrinsicForClangBuiltin(). /// /// \return - The architecture prefix, or 0 if none is defined. - static StringRef getArchTypePrefix(ArchType Kind); + LLVM_ABI static StringRef getArchTypePrefix(ArchType Kind); /// Get the canonical name for the \p Kind vendor. - static StringRef getVendorTypeName(VendorType Kind); + LLVM_ABI static StringRef getVendorTypeName(VendorType Kind); /// Get the canonical name for the \p Kind operating system. - static StringRef getOSTypeName(OSType Kind); + LLVM_ABI static StringRef getOSTypeName(OSType Kind); /// Get the canonical name for the \p Kind environment. - static StringRef getEnvironmentTypeName(EnvironmentType Kind); + LLVM_ABI static StringRef getEnvironmentTypeName(EnvironmentType Kind); /// Get the name for the \p Object format. - static StringRef getObjectFormatTypeName(ObjectFormatType ObjectFormat); + LLVM_ABI static StringRef + getObjectFormatTypeName(ObjectFormatType ObjectFormat); /// @} /// @name Static helpers for converting alternate architecture names. 
/// @{ /// The canonical type for the given LLVM architecture name (e.g., "x86"). - static ArchType getArchTypeForLLVMName(StringRef Str); + LLVM_ABI static ArchType getArchTypeForLLVMName(StringRef Str); /// @} /// Returns a canonicalized OS version number for the specified OS. - static VersionTuple getCanonicalVersionForOS(OSType OSKind, - const VersionTuple &Version, - bool IsInValidRange); + LLVM_ABI static VersionTuple + getCanonicalVersionForOS(OSType OSKind, const VersionTuple &Version, + bool IsInValidRange); /// Returns whether an OS version is invalid and would not map to an Apple OS. - static bool isValidVersionForOS(OSType OSKind, const VersionTuple &Version); + LLVM_ABI static bool isValidVersionForOS(OSType OSKind, + const VersionTuple &Version); }; } // End llvm namespace diff --git a/llvm/include/llvm/TargetParser/X86TargetParser.h b/llvm/include/llvm/TargetParser/X86TargetParser.h index 8447aca7bb92a..f6aeaada346e7 100644 --- a/llvm/include/llvm/TargetParser/X86TargetParser.h +++ b/llvm/include/llvm/TargetParser/X86TargetParser.h @@ -15,6 +15,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/StringMap.h" +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -153,34 +154,36 @@ enum CPUKind { /// Parse \p CPU string into a CPUKind. Will only accept 64-bit capable CPUs if /// \p Only64Bit is true. -CPUKind parseArchX86(StringRef CPU, bool Only64Bit = false); -CPUKind parseTuneCPU(StringRef CPU, bool Only64Bit = false); +LLVM_ABI CPUKind parseArchX86(StringRef CPU, bool Only64Bit = false); +LLVM_ABI CPUKind parseTuneCPU(StringRef CPU, bool Only64Bit = false); /// Provide a list of valid CPU names. If \p Only64Bit is true, the list will /// only contain 64-bit capable CPUs. -void fillValidCPUArchList(SmallVectorImpl &Values, - bool Only64Bit = false); +LLVM_ABI void fillValidCPUArchList(SmallVectorImpl &Values, + bool Only64Bit = false); /// Provide a list of valid -mtune names. 
-void fillValidTuneCPUList(SmallVectorImpl &Values, - bool Only64Bit = false); +LLVM_ABI void fillValidTuneCPUList(SmallVectorImpl &Values, + bool Only64Bit = false); /// Get the key feature prioritizing target multiversioning. -ProcessorFeatures getKeyFeature(CPUKind Kind); +LLVM_ABI ProcessorFeatures getKeyFeature(CPUKind Kind); /// Fill in the features that \p CPU supports into \p Features. /// "+" will be append in front of each feature if NeedPlus is true. -void getFeaturesForCPU(StringRef CPU, SmallVectorImpl &Features, - bool NeedPlus = false); +LLVM_ABI void getFeaturesForCPU(StringRef CPU, + SmallVectorImpl &Features, + bool NeedPlus = false); /// Set or clear entries in \p Features that are implied to be enabled/disabled /// by the provided \p Feature. -void updateImpliedFeatures(StringRef Feature, bool Enabled, - StringMap &Features); - -char getCPUDispatchMangling(StringRef Name); -bool validateCPUSpecificCPUDispatch(StringRef Name); -std::array getCpuSupportsMask(ArrayRef FeatureStrs); -unsigned getFeaturePriority(ProcessorFeatures Feat); +LLVM_ABI void updateImpliedFeatures(StringRef Feature, bool Enabled, + StringMap &Features); + +LLVM_ABI char getCPUDispatchMangling(StringRef Name); +LLVM_ABI bool validateCPUSpecificCPUDispatch(StringRef Name); +LLVM_ABI std::array +getCpuSupportsMask(ArrayRef FeatureStrs); +LLVM_ABI unsigned getFeaturePriority(ProcessorFeatures Feat); } // namespace X86 } // namespace llvm From 8f8ed23c6247e9c1dd2df4494930813b353c52c4 Mon Sep 17 00:00:00 2001 From: Andrew Rogers Date: Wed, 11 Jun 2025 09:19:13 -0700 Subject: [PATCH 099/851] [llvm] annotate interfaces in llvm/SandboxIR for DLL export (#142863) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Purpose This patch is one in a series of code-mods that annotate LLVM’s public interface for export. This patch annotates the `llvm/SandboxIR` library. 
These annotations currently have no meaningful impact on the LLVM build; however, they are a prerequisite to support an LLVM Windows DLL (shared library) build. ## Background This effort is tracked in #109483. Additional context is provided in [this discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307), and documentation for `LLVM_ABI` and related annotations is found in the LLVM repo [here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst). The bulk of these changes were generated automatically using the [Interface Definition Scanner (IDS)](https://github.com/compnerd/ids) tool, followed by formatting with `git clang-format`. The following manual adjustments were also applied after running IDS on Linux: - Remove explicit `GlobalWithNodeAPI::LLVMGVToGV::operator()` template function instantiations that were previously added for the dylib build. Instead, directly annotate the `LLVMGVToGV::operator()` method with `LLVM_ABI`. This is done so the DLL build works with both MSVC and clang-cl. - Explicitly `#include "llvm/SandboxIR/Value.h"` in `Tracker.h` so that the symbol is available for exported templates in this file. These templates get fully instantiated on DLL export, so they require the full definition of `Value`. - Add extern template instantiation declarations for `GlobalWithNodeAPI` template types in `Constant.h` and annotate them with `LLVM_TEMPLATE_ABI`. - Add `LLVM_EXPORT_TEMPLATE` to `GlobalWithNodeAPI` template instantiations in `Constant.cpp`. ## Validation Local builds and tests to validate cross-platform compatibility.
This included llvm, clang, and lldb on the following configurations: - Windows with MSVC - Windows with Clang - Linux with GCC - Linux with Clang - Darwin with Clang --- llvm/include/llvm/SandboxIR/BasicBlock.h | 21 +- llvm/include/llvm/SandboxIR/Constant.h | 207 ++++---- llvm/include/llvm/SandboxIR/Context.h | 135 +++--- llvm/include/llvm/SandboxIR/Function.h | 5 +- llvm/include/llvm/SandboxIR/Instruction.h | 547 +++++++++++----------- llvm/include/llvm/SandboxIR/Module.h | 10 +- llvm/include/llvm/SandboxIR/PassManager.h | 6 +- llvm/include/llvm/SandboxIR/Region.h | 19 +- llvm/include/llvm/SandboxIR/Tracker.h | 34 +- llvm/include/llvm/SandboxIR/Type.h | 53 ++- llvm/include/llvm/SandboxIR/Use.h | 9 +- llvm/include/llvm/SandboxIR/User.h | 13 +- llvm/include/llvm/SandboxIR/Value.h | 20 +- llvm/lib/SandboxIR/Constant.cpp | 37 +- 14 files changed, 565 insertions(+), 551 deletions(-) diff --git a/llvm/include/llvm/SandboxIR/BasicBlock.h b/llvm/include/llvm/SandboxIR/BasicBlock.h index 93e79e2a421f9..25bbb6c058faa 100644 --- a/llvm/include/llvm/SandboxIR/BasicBlock.h +++ b/llvm/include/llvm/SandboxIR/BasicBlock.h @@ -11,6 +11,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/SandboxIR/Value.h" +#include "llvm/Support/Compiler.h" namespace llvm::sandboxir { @@ -32,20 +33,20 @@ class BBIterator { llvm::BasicBlock *BB; llvm::BasicBlock::iterator It; Context *Ctx; - pointer getInstr(llvm::BasicBlock::iterator It) const; + LLVM_ABI pointer getInstr(llvm::BasicBlock::iterator It) const; public: BBIterator() : BB(nullptr), Ctx(nullptr) {} BBIterator(llvm::BasicBlock *BB, llvm::BasicBlock::iterator It, Context *Ctx) : BB(BB), It(It), Ctx(Ctx) {} reference operator*() const { return *getInstr(It); } - BBIterator &operator++(); + LLVM_ABI BBIterator &operator++(); BBIterator operator++(int) { auto Copy = *this; ++*this; return Copy; } - BBIterator &operator--(); + LLVM_ABI BBIterator &operator--(); BBIterator operator--(int) { auto Copy = *this; --*this; @@ -60,14 +61,14 @@ 
class BBIterator { /// the instruction is not found in the IR-to-SandboxIR tables. pointer get() const { return getInstr(It); } /// \Returns the parent BB. - BasicBlock *getNodeParent() const; + LLVM_ABI BasicBlock *getNodeParent() const; }; /// Contains a list of sandboxir::Instruction's. class BasicBlock : public Value { /// Builds a graph that contains all values in \p BB in their original form /// i.e., no vectorization is taking place here. - void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB); + LLVM_ABI void buildBasicBlockFromLLVMIR(llvm::BasicBlock *LLVMBB); friend class Context; // For `buildBasicBlockFromIR` friend class Instruction; // For LLVM Val. @@ -82,9 +83,9 @@ class BasicBlock : public Value { static bool classof(const Value *From) { return From->getSubclassID() == Value::ClassID::Block; } - Function *getParent() const; + LLVM_ABI Function *getParent() const; using iterator = BBIterator; - iterator begin() const; + LLVM_ABI iterator begin() const; iterator end() const { auto *BB = cast(Val); return iterator(BB, BB->end(), &Ctx); @@ -96,10 +97,10 @@ class BasicBlock : public Value { return std::make_reverse_iterator(begin()); } Context &getContext() const { return Ctx; } - Instruction *getTerminator() const; + LLVM_ABI Instruction *getTerminator() const; bool empty() const { return begin() == end(); } - Instruction &front() const; - Instruction &back() const; + LLVM_ABI Instruction &front() const; + LLVM_ABI Instruction &back() const; #ifndef NDEBUG void verify() const final; diff --git a/llvm/include/llvm/SandboxIR/Constant.h b/llvm/include/llvm/SandboxIR/Constant.h index e7b18a442d330..6f682a7059d10 100644 --- a/llvm/include/llvm/SandboxIR/Constant.h +++ b/llvm/include/llvm/SandboxIR/Constant.h @@ -76,16 +76,16 @@ class ConstantInt : public Constant { } public: - static ConstantInt *getTrue(Context &Ctx); - static ConstantInt *getFalse(Context &Ctx); - static ConstantInt *getBool(Context &Ctx, bool V); - static Constant *getTrue(Type *Ty); 
- static Constant *getFalse(Type *Ty); - static Constant *getBool(Type *Ty, bool V); + LLVM_ABI static ConstantInt *getTrue(Context &Ctx); + LLVM_ABI static ConstantInt *getFalse(Context &Ctx); + LLVM_ABI static ConstantInt *getBool(Context &Ctx, bool V); + LLVM_ABI static Constant *getTrue(Type *Ty); + LLVM_ABI static Constant *getFalse(Type *Ty); + LLVM_ABI static Constant *getBool(Type *Ty, bool V); /// If Ty is a vector type, return a Constant with a splat of the given /// value. Otherwise return a ConstantInt for the given value. - static ConstantInt *get(Type *Ty, uint64_t V, bool IsSigned = false); + LLVM_ABI static ConstantInt *get(Type *Ty, uint64_t V, bool IsSigned = false); /// Return a ConstantInt with the specified integer value for the specified /// type. If the type is wider than 64 bits, the value will be zero-extended @@ -93,27 +93,29 @@ class ConstantInt : public Constant { /// be interpreted as a 64-bit signed integer and sign-extended to fit /// the type. /// Get a ConstantInt for a specific value. - static ConstantInt *get(IntegerType *Ty, uint64_t V, bool IsSigned = false); + LLVM_ABI static ConstantInt *get(IntegerType *Ty, uint64_t V, + bool IsSigned = false); /// Return a ConstantInt with the specified value for the specified type. The /// value V will be canonicalized to a an unsigned APInt. Accessing it with /// either getSExtValue() or getZExtValue() will yield a correctly sized and /// signed value for the type Ty. /// Get a ConstantInt for a specific signed value. - static ConstantInt *getSigned(IntegerType *Ty, int64_t V); - static Constant *getSigned(Type *Ty, int64_t V); + LLVM_ABI static ConstantInt *getSigned(IntegerType *Ty, int64_t V); + LLVM_ABI static Constant *getSigned(Type *Ty, int64_t V); /// Return a ConstantInt with the specified value and an implied Type. The /// type is the integer type that corresponds to the bit width of the value. 
- static ConstantInt *get(Context &Ctx, const APInt &V); + LLVM_ABI static ConstantInt *get(Context &Ctx, const APInt &V); /// Return a ConstantInt constructed from the string strStart with the given /// radix. - static ConstantInt *get(IntegerType *Ty, StringRef Str, uint8_t Radix); + LLVM_ABI static ConstantInt *get(IntegerType *Ty, StringRef Str, + uint8_t Radix); /// If Ty is a vector type, return a Constant with a splat of the given /// value. Otherwise return a ConstantInt for the given value. - static Constant *get(Type *Ty, const APInt &V); + LLVM_ABI static Constant *get(Type *Ty, const APInt &V); /// Return the constant as an APInt value reference. This allows clients to /// obtain a full-precision copy of the value. @@ -166,7 +168,7 @@ class ConstantInt : public Constant { /// Variant of the getType() method to always return an IntegerType, which /// reduces the amount of casting needed in parts of the compiler. - IntegerType *getIntegerType() const; + LLVM_ABI IntegerType *getIntegerType() const; /// This static method returns true if the type Ty is big enough to /// represent the value V. This can be used to avoid having the get method @@ -177,8 +179,8 @@ class ConstantInt : public Constant { /// to the appropriate unsigned type before calling the method. /// @returns true if V is a valid value for type Ty /// Determine if the value is in range for the given type. - static bool isValueValidForType(Type *Ty, uint64_t V); - static bool isValueValidForType(Type *Ty, int64_t V); + LLVM_ABI static bool isValueValidForType(Type *Ty, uint64_t V); + LLVM_ABI static bool isValueValidForType(Type *Ty, int64_t V); bool isNegative() const { return cast(Val)->isNegative(); } @@ -264,29 +266,29 @@ class ConstantFP final : public Constant { /// for the specified value in the specified type. This should only be used /// for simple constant values like 2.0/1.0 etc, that are known-valid both as /// host double and as the target format. 
- static Constant *get(Type *Ty, double V); + LLVM_ABI static Constant *get(Type *Ty, double V); /// If Ty is a vector type, return a Constant with a splat of the given /// value. Otherwise return a ConstantFP for the given value. - static Constant *get(Type *Ty, const APFloat &V); + LLVM_ABI static Constant *get(Type *Ty, const APFloat &V); - static Constant *get(Type *Ty, StringRef Str); + LLVM_ABI static Constant *get(Type *Ty, StringRef Str); - static ConstantFP *get(const APFloat &V, Context &Ctx); + LLVM_ABI static ConstantFP *get(const APFloat &V, Context &Ctx); - static Constant *getNaN(Type *Ty, bool Negative = false, - uint64_t Payload = 0); - static Constant *getQNaN(Type *Ty, bool Negative = false, - APInt *Payload = nullptr); - static Constant *getSNaN(Type *Ty, bool Negative = false, - APInt *Payload = nullptr); - static Constant *getZero(Type *Ty, bool Negative = false); + LLVM_ABI static Constant *getNaN(Type *Ty, bool Negative = false, + uint64_t Payload = 0); + LLVM_ABI static Constant *getQNaN(Type *Ty, bool Negative = false, + APInt *Payload = nullptr); + LLVM_ABI static Constant *getSNaN(Type *Ty, bool Negative = false, + APInt *Payload = nullptr); + LLVM_ABI static Constant *getZero(Type *Ty, bool Negative = false); - static Constant *getNegativeZero(Type *Ty); - static Constant *getInfinity(Type *Ty, bool Negative = false); + LLVM_ABI static Constant *getNegativeZero(Type *Ty); + LLVM_ABI static Constant *getInfinity(Type *Ty, bool Negative = false); /// Return true if Ty is big enough to represent V. - static bool isValueValidForType(Type *Ty, const APFloat &V); + LLVM_ABI static bool isValueValidForType(Type *Ty, const APFloat &V); inline const APFloat &getValueAPF() const { return cast(Val)->getValueAPF(); @@ -362,8 +364,8 @@ class ConstantArray final : public ConstantAggregate { friend class Context; // For constructor. 
public: - static Constant *get(ArrayType *T, ArrayRef V); - ArrayType *getType() const; + LLVM_ABI static Constant *get(ArrayType *T, ArrayRef V); + LLVM_ABI ArrayType *getType() const; // TODO: Missing functions: getType(), getTypeForElements(), getAnon(), get(). @@ -379,7 +381,7 @@ class ConstantStruct final : public ConstantAggregate { friend class Context; // For constructor. public: - static Constant *get(StructType *T, ArrayRef V); + LLVM_ABI static Constant *get(StructType *T, ArrayRef V); template static std::enable_if_t::value, Constant *> @@ -396,8 +398,8 @@ class ConstantStruct final : public ConstantAggregate { return get(getTypeForElements(Ctx, V, Packed), V); } /// This version of the method allows an empty list. - static StructType *getTypeForElements(Context &Ctx, ArrayRef V, - bool Packed = false); + LLVM_ABI static StructType * + getTypeForElements(Context &Ctx, ArrayRef V, bool Packed = false); /// Return an anonymous struct type to use for a constant with the specified /// set of elements. The list must not be empty. static StructType *getTypeForElements(ArrayRef V, @@ -424,10 +426,10 @@ class ConstantVector final : public ConstantAggregate { friend class Context; // For constructor. public: - static Constant *get(ArrayRef V); + LLVM_ABI static Constant *get(ArrayRef V); /// Return a ConstantVector with the specified constant in each element. /// Note that this might not return an instance of ConstantVector - static Constant *getSplat(ElementCount EC, Constant *Elt); + LLVM_ABI static Constant *getSplat(ElementCount EC, Constant *Elt); /// Specialize the getType() method to always return a FixedVectorType, /// which reduces the amount of casting needed in parts of the compiler. inline FixedVectorType *getType() const { @@ -436,7 +438,7 @@ class ConstantVector final : public ConstantAggregate { /// If all elements of the vector constant have the same value, return that /// value. Otherwise, return nullptr. 
Ignore poison elements by setting /// AllowPoison to true. - Constant *getSplatValue(bool AllowPoison = false) const; + LLVM_ABI Constant *getSplatValue(bool AllowPoison = false) const; /// For isa/dyn_cast. static bool classof(const Value *From) { @@ -451,18 +453,18 @@ class ConstantAggregateZero final : public Constant { friend class Context; // For constructor. public: - static ConstantAggregateZero *get(Type *Ty); + LLVM_ABI static ConstantAggregateZero *get(Type *Ty); /// If this CAZ has array or vector type, return a zero with the right element /// type. - Constant *getSequentialElement() const; + LLVM_ABI Constant *getSequentialElement() const; /// If this CAZ has struct type, return a zero with the right element type for /// the specified element. - Constant *getStructElement(unsigned Elt) const; + LLVM_ABI Constant *getStructElement(unsigned Elt) const; /// Return a zero of the right value for the specified GEP index if we can, /// otherwise return null (e.g. if C is a ConstantExpr). - Constant *getElementValue(Constant *C) const; + LLVM_ABI Constant *getElementValue(Constant *C) const; /// Return a zero of the right value for the specified GEP index. - Constant *getElementValue(unsigned Idx) const; + LLVM_ABI Constant *getElementValue(unsigned Idx) const; /// Return the number of elements in the array, vector, or struct. ElementCount getElementCount() const { return cast(Val)->getElementCount(); @@ -769,9 +771,9 @@ class ConstantPointerNull final : public Constant { friend class Context; // For constructor. public: - static ConstantPointerNull *get(PointerType *Ty); + LLVM_ABI static ConstantPointerNull *get(PointerType *Ty); - PointerType *getType() const; + LLVM_ABI PointerType *getType() const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -802,22 +804,22 @@ class UndefValue : public Constant { public: /// Static factory methods - Return an 'undef' object of the specified type. 
- static UndefValue *get(Type *T); + LLVM_ABI static UndefValue *get(Type *T); /// If this Undef has array or vector type, return a undef with the right /// element type. - UndefValue *getSequentialElement() const; + LLVM_ABI UndefValue *getSequentialElement() const; /// If this undef has struct type, return a undef with the right element type /// for the specified element. - UndefValue *getStructElement(unsigned Elt) const; + LLVM_ABI UndefValue *getStructElement(unsigned Elt) const; /// Return an undef of the right value for the specified GEP index if we can, /// otherwise return null (e.g. if C is a ConstantExpr). - UndefValue *getElementValue(Constant *C) const; + LLVM_ABI UndefValue *getElementValue(Constant *C) const; /// Return an undef of the right value for the specified GEP index. - UndefValue *getElementValue(unsigned Idx) const; + LLVM_ABI UndefValue *getElementValue(unsigned Idx) const; /// Return the number of elements in the array, vector, or struct. unsigned getNumElements() const { @@ -850,22 +852,22 @@ class PoisonValue final : public UndefValue { public: /// Static factory methods - Return an 'poison' object of the specified type. - static PoisonValue *get(Type *T); + LLVM_ABI static PoisonValue *get(Type *T); /// If this poison has array or vector type, return a poison with the right /// element type. - PoisonValue *getSequentialElement() const; + LLVM_ABI PoisonValue *getSequentialElement() const; /// If this poison has struct type, return a poison with the right element /// type for the specified element. - PoisonValue *getStructElement(unsigned Elt) const; + LLVM_ABI PoisonValue *getStructElement(unsigned Elt) const; /// Return an poison of the right value for the specified GEP index if we can, /// otherwise return null (e.g. if C is a ConstantExpr). - PoisonValue *getElementValue(Constant *C) const; + LLVM_ABI PoisonValue *getElementValue(Constant *C) const; /// Return an poison of the right value for the specified GEP index. 
- PoisonValue *getElementValue(unsigned Idx) const; + LLVM_ABI PoisonValue *getElementValue(unsigned Idx) const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -924,7 +926,7 @@ class GlobalValue : public Constant { UnnamedAddr getUnnamedAddr() const { return cast(Val)->getUnnamedAddr(); } - void setUnnamedAddr(UnnamedAddr V); + LLVM_ABI void setUnnamedAddr(UnnamedAddr V); static UnnamedAddr getMinUnnamedAddr(UnnamedAddr A, UnnamedAddr B) { return llvm::GlobalValue::getMinUnnamedAddr(A, B); @@ -946,7 +948,7 @@ class GlobalValue : public Constant { bool hasProtectedVisibility() const { return cast(Val)->hasProtectedVisibility(); } - void setVisibility(VisibilityTypes V); + LLVM_ABI void setVisibility(VisibilityTypes V); // TODO: Add missing functions. }; @@ -996,7 +998,7 @@ class GlobalObject : public GlobalValue { /// /// Setting the section to the empty string tells LLVM to choose an /// appropriate default object file section. - void setSection(StringRef S); + LLVM_ABI void setSection(StringRef S); bool hasComdat() const { return cast(Val)->hasComdat(); } @@ -1031,7 +1033,7 @@ class GlobalWithNodeAPI : public ParentT { struct LLVMGVToGV { Context &Ctx; LLVMGVToGV(Context &Ctx) : Ctx(Ctx) {} - GlobalT &operator()(LLVMGlobalT &LLVMGV) const; + LLVM_ABI GlobalT &operator()(LLVMGlobalT &LLVMGV) const; }; public: @@ -1060,24 +1062,15 @@ class GlobalWithNodeAPI : public ParentT { } }; -// These are needed for SandboxIRTest when building with LLVM_BUILD_LLVM_DYLIB -extern template LLVM_TEMPLATE_ABI GlobalIFunc & -GlobalWithNodeAPI::LLVMGVToGV::operator()(llvm::GlobalIFunc - &LLVMGV) - const; -extern template LLVM_TEMPLATE_ABI Function & -GlobalWithNodeAPI:: - LLVMGVToGV::operator()(llvm::Function &LLVMGV) const; - -extern template LLVM_TEMPLATE_ABI GlobalVariable &GlobalWithNodeAPI< - GlobalVariable, llvm::GlobalVariable, GlobalObject, - llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalVariable &LLVMGV) - const; -extern template 
LLVM_TEMPLATE_ABI GlobalAlias & -GlobalWithNodeAPI::LLVMGVToGV::operator()(llvm::GlobalAlias - &LLVMGV) const; +// Explicit instantiations. +extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI< + GlobalIFunc, llvm::GlobalIFunc, GlobalObject, llvm::GlobalObject>; +extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI< + Function, llvm::Function, GlobalObject, llvm::GlobalObject>; +extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI< + GlobalVariable, llvm::GlobalVariable, GlobalObject, llvm::GlobalObject>; +extern template class LLVM_TEMPLATE_ABI GlobalWithNodeAPI< + GlobalAlias, llvm::GlobalAlias, GlobalValue, llvm::GlobalValue>; class GlobalIFunc final : public GlobalWithNodeAPI(this)->getResolverFunction(); } @@ -1136,7 +1129,7 @@ class GlobalVariable final struct LLVMGVToGV { Context &Ctx; LLVMGVToGV(Context &Ctx) : Ctx(Ctx) {} - GlobalVariable &operator()(llvm::GlobalVariable &LLVMGV) const; + LLVM_ABI GlobalVariable &operator()(llvm::GlobalVariable &LLVMGV) const; }; public: @@ -1181,11 +1174,11 @@ class GlobalVariable final /// illegal to call this method if the global is external, because we cannot /// tell what the value is initialized to! /// - Constant *getInitializer() const; + LLVM_ABI Constant *getInitializer() const; /// setInitializer - Sets the initializer for this global variable, removing /// any existing initializer if InitVal==NULL. The initializer must have the /// type getValueType(). - void setInitializer(Constant *InitVal); + LLVM_ABI void setInitializer(Constant *InitVal); // TODO: Add missing replaceInitializer(). 
Requires special tracker @@ -1196,12 +1189,12 @@ class GlobalVariable final bool isConstant() const { return cast(Val)->isConstant(); } - void setConstant(bool V); + LLVM_ABI void setConstant(bool V); bool isExternallyInitialized() const { return cast(Val)->isExternallyInitialized(); } - void setExternallyInitialized(bool Val); + LLVM_ABI void setExternallyInitialized(bool Val); // TODO: Missing copyAttributesFrom() @@ -1278,7 +1271,7 @@ class GlobalVariable final /// Sets the alignment attribute of the GlobalVariable. /// This method will be deprecated as the alignment property should always be /// defined. - void setAlignment(MaybeAlign Align); + LLVM_ABI void setAlignment(MaybeAlign Align); // TODO: Missing setCodeModel(). Requires custom tracker. @@ -1311,10 +1304,10 @@ class GlobalAlias final // TODO: Missing copyAttributresFrom(). // TODO: Missing removeFromParent(), eraseFromParent(). - void setAliasee(Constant *Aliasee); - Constant *getAliasee() const; + LLVM_ABI void setAliasee(Constant *Aliasee); + LLVM_ABI Constant *getAliasee() const; - const GlobalObject *getAliaseeObject() const; + LLVM_ABI const GlobalObject *getAliaseeObject() const; GlobalObject *getAliaseeObject() { return const_cast( static_cast(this)->getAliaseeObject()); @@ -1336,12 +1329,12 @@ class NoCFIValue final : public Constant { public: /// Return a NoCFIValue for the specified function. - static NoCFIValue *get(GlobalValue *GV); + LLVM_ABI static NoCFIValue *get(GlobalValue *GV); - GlobalValue *getGlobalValue() const; + LLVM_ABI GlobalValue *getGlobalValue() const; /// NoCFIValue is always a pointer. - PointerType *getType() const; + LLVM_ABI PointerType *getType() const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { return From->getSubclassID() == ClassID::NoCFIValue; @@ -1369,21 +1362,21 @@ class ConstantPtrAuth final : public Constant { public: /// Return a pointer signed with the specified parameters. 
- static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key, - ConstantInt *Disc, Constant *AddrDisc); + LLVM_ABI static ConstantPtrAuth *get(Constant *Ptr, ConstantInt *Key, + ConstantInt *Disc, Constant *AddrDisc); /// The pointer that is signed in this ptrauth signed pointer. - Constant *getPointer() const; + LLVM_ABI Constant *getPointer() const; /// The Key ID, an i32 constant. - ConstantInt *getKey() const; + LLVM_ABI ConstantInt *getKey() const; /// The integer discriminator, an i64 constant, or 0. - ConstantInt *getDiscriminator() const; + LLVM_ABI ConstantInt *getDiscriminator() const; /// The address discriminator if any, or the null constant. /// If present, this must be a value equivalent to the storage location of /// the only global-initializer user of the ptrauth signed pointer. - Constant *getAddrDiscriminator() const; + LLVM_ABI Constant *getAddrDiscriminator() const; /// Whether there is any non-null address discriminator. bool hasAddressDiscriminator() const { @@ -1410,7 +1403,7 @@ class ConstantPtrAuth final : public Constant { /// Produce a new ptrauth expression signing the given value using /// the same schema as is stored in one. - ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const; + LLVM_ABI ConstantPtrAuth *getWithSameSchema(Constant *Pointer) const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -1438,19 +1431,19 @@ class BlockAddress final : public Constant { public: /// Return a BlockAddress for the specified function and basic block. - static BlockAddress *get(Function *F, BasicBlock *BB); + LLVM_ABI static BlockAddress *get(Function *F, BasicBlock *BB); /// Return a BlockAddress for the specified basic block. The basic /// block must be embedded into a function. - static BlockAddress *get(BasicBlock *BB); + LLVM_ABI static BlockAddress *get(BasicBlock *BB); /// Lookup an existing \c BlockAddress constant for the given BasicBlock. 
/// /// \returns 0 if \c !BB->hasAddressTaken(), otherwise the \c BlockAddress. - static BlockAddress *lookup(const BasicBlock *BB); + LLVM_ABI static BlockAddress *lookup(const BasicBlock *BB); - Function *getFunction() const; - BasicBlock *getBasicBlock() const; + LLVM_ABI Function *getFunction() const; + LLVM_ABI BasicBlock *getBasicBlock() const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -1465,9 +1458,9 @@ class DSOLocalEquivalent final : public Constant { public: /// Return a DSOLocalEquivalent for the specified global value. - static DSOLocalEquivalent *get(GlobalValue *GV); + LLVM_ABI static DSOLocalEquivalent *get(GlobalValue *GV); - GlobalValue *getGlobalValue() const; + LLVM_ABI GlobalValue *getGlobalValue() const; /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { @@ -1498,7 +1491,7 @@ class ConstantTokenNone final : public Constant { public: /// Return the ConstantTokenNone. - static ConstantTokenNone *get(Context &Ctx); + LLVM_ABI static ConstantTokenNone *get(Context &Ctx); /// For isa/dyn_cast. static bool classof(const sandboxir::Value *From) { diff --git a/llvm/include/llvm/SandboxIR/Context.h b/llvm/include/llvm/SandboxIR/Context.h index a8a21b0db855e..7d8b2c86e94a7 100644 --- a/llvm/include/llvm/SandboxIR/Context.h +++ b/llvm/include/llvm/SandboxIR/Context.h @@ -15,6 +15,7 @@ #include "llvm/IR/LLVMContext.h" #include "llvm/SandboxIR/Tracker.h" #include "llvm/SandboxIR/Type.h" +#include "llvm/Support/Compiler.h" #include @@ -112,32 +113,33 @@ class Context { CallbackID::ValTy NextCallbackID = 1; /// Remove \p V from the maps and returns the unique_ptr. - std::unique_ptr detachLLVMValue(llvm::Value *V); + LLVM_ABI std::unique_ptr detachLLVMValue(llvm::Value *V); /// Remove \p SBV from all SandboxIR maps and stop owning it. This effectively /// detaches \p V from the underlying IR. 
- std::unique_ptr detach(Value *V); + LLVM_ABI std::unique_ptr detach(Value *V); friend class Instruction; // For detach(). /// Take ownership of VPtr and store it in `LLVMValueToValueMap`. - Value *registerValue(std::unique_ptr &&VPtr); + LLVM_ABI Value *registerValue(std::unique_ptr &&VPtr); friend class EraseFromParent; // For registerValue(). /// This is the actual function that creates sandboxir values for \p V, /// and among others handles all instruction types. - Value *getOrCreateValueInternal(llvm::Value *V, llvm::User *U = nullptr); + LLVM_ABI Value *getOrCreateValueInternal(llvm::Value *V, + llvm::User *U = nullptr); /// Get or create a sandboxir::Argument for an existing LLVM IR \p LLVMArg. - Argument *getOrCreateArgument(llvm::Argument *LLVMArg); + LLVM_ABI Argument *getOrCreateArgument(llvm::Argument *LLVMArg); /// Get or create a sandboxir::Value for an existing LLVM IR \p LLVMV. Value *getOrCreateValue(llvm::Value *LLVMV) { return getOrCreateValueInternal(LLVMV, 0); } /// Get or create a sandboxir::Constant from an existing LLVM IR \p LLVMC. - Constant *getOrCreateConstant(llvm::Constant *LLVMC); + LLVM_ABI Constant *getOrCreateConstant(llvm::Constant *LLVMC); friend class ConstantDataSequential; // For getOrCreateConstant(). friend class Utils; // For getMemoryBase - void runEraseInstrCallbacks(Instruction *I); - void runCreateInstrCallbacks(Instruction *I); - void runMoveInstrCallbacks(Instruction *I, const BBIterator &Where); - void runSetUseCallbacks(const Use &U, Value *NewSrc); + LLVM_ABI void runEraseInstrCallbacks(Instruction *I); + LLVM_ABI void runCreateInstrCallbacks(Instruction *I); + LLVM_ABI void runMoveInstrCallbacks(Instruction *I, const BBIterator &Where); + LLVM_ABI void runSetUseCallbacks(const Use &U, Value *NewSrc); friend class User; // For runSetUseCallbacks(). friend class Value; // For runSetUseCallbacks(). @@ -148,90 +150,97 @@ class Context { /// Create a sandboxir::BasicBlock for an existing LLVM IR \p BB. 
This will /// also create all contents of the block. - BasicBlock *createBasicBlock(llvm::BasicBlock *BB); + LLVM_ABI BasicBlock *createBasicBlock(llvm::BasicBlock *BB); friend class BasicBlock; // For getOrCreateValue(). IRBuilder LLVMIRBuilder; auto &getLLVMIRBuilder() { return LLVMIRBuilder; } - VAArgInst *createVAArgInst(llvm::VAArgInst *SI); + LLVM_ABI VAArgInst *createVAArgInst(llvm::VAArgInst *SI); friend VAArgInst; // For createVAArgInst() - FreezeInst *createFreezeInst(llvm::FreezeInst *SI); + LLVM_ABI FreezeInst *createFreezeInst(llvm::FreezeInst *SI); friend FreezeInst; // For createFreezeInst() - FenceInst *createFenceInst(llvm::FenceInst *SI); + LLVM_ABI FenceInst *createFenceInst(llvm::FenceInst *SI); friend FenceInst; // For createFenceInst() - SelectInst *createSelectInst(llvm::SelectInst *SI); + LLVM_ABI SelectInst *createSelectInst(llvm::SelectInst *SI); friend SelectInst; // For createSelectInst() - InsertElementInst *createInsertElementInst(llvm::InsertElementInst *IEI); + LLVM_ABI InsertElementInst * + createInsertElementInst(llvm::InsertElementInst *IEI); friend InsertElementInst; // For createInsertElementInst() - ExtractElementInst *createExtractElementInst(llvm::ExtractElementInst *EEI); + LLVM_ABI ExtractElementInst * + createExtractElementInst(llvm::ExtractElementInst *EEI); friend ExtractElementInst; // For createExtractElementInst() - ShuffleVectorInst *createShuffleVectorInst(llvm::ShuffleVectorInst *SVI); + LLVM_ABI ShuffleVectorInst * + createShuffleVectorInst(llvm::ShuffleVectorInst *SVI); friend ShuffleVectorInst; // For createShuffleVectorInst() - ExtractValueInst *createExtractValueInst(llvm::ExtractValueInst *IVI); + LLVM_ABI ExtractValueInst * + createExtractValueInst(llvm::ExtractValueInst *IVI); friend ExtractValueInst; // For createExtractValueInst() - InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI); + LLVM_ABI InsertValueInst *createInsertValueInst(llvm::InsertValueInst *IVI); friend InsertValueInst; // 
For createInsertValueInst() - BranchInst *createBranchInst(llvm::BranchInst *I); + LLVM_ABI BranchInst *createBranchInst(llvm::BranchInst *I); friend BranchInst; // For createBranchInst() - LoadInst *createLoadInst(llvm::LoadInst *LI); + LLVM_ABI LoadInst *createLoadInst(llvm::LoadInst *LI); friend LoadInst; // For createLoadInst() - StoreInst *createStoreInst(llvm::StoreInst *SI); + LLVM_ABI StoreInst *createStoreInst(llvm::StoreInst *SI); friend StoreInst; // For createStoreInst() - ReturnInst *createReturnInst(llvm::ReturnInst *I); + LLVM_ABI ReturnInst *createReturnInst(llvm::ReturnInst *I); friend ReturnInst; // For createReturnInst() - CallInst *createCallInst(llvm::CallInst *I); + LLVM_ABI CallInst *createCallInst(llvm::CallInst *I); friend CallInst; // For createCallInst() - InvokeInst *createInvokeInst(llvm::InvokeInst *I); + LLVM_ABI InvokeInst *createInvokeInst(llvm::InvokeInst *I); friend InvokeInst; // For createInvokeInst() - CallBrInst *createCallBrInst(llvm::CallBrInst *I); + LLVM_ABI CallBrInst *createCallBrInst(llvm::CallBrInst *I); friend CallBrInst; // For createCallBrInst() - LandingPadInst *createLandingPadInst(llvm::LandingPadInst *I); + LLVM_ABI LandingPadInst *createLandingPadInst(llvm::LandingPadInst *I); friend LandingPadInst; // For createLandingPadInst() - CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I); + LLVM_ABI CatchPadInst *createCatchPadInst(llvm::CatchPadInst *I); friend CatchPadInst; // For createCatchPadInst() - CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I); + LLVM_ABI CleanupPadInst *createCleanupPadInst(llvm::CleanupPadInst *I); friend CleanupPadInst; // For createCleanupPadInst() - CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I); + LLVM_ABI CatchReturnInst *createCatchReturnInst(llvm::CatchReturnInst *I); friend CatchReturnInst; // For createCatchReturnInst() - CleanupReturnInst *createCleanupReturnInst(llvm::CleanupReturnInst *I); + LLVM_ABI CleanupReturnInst * + 
createCleanupReturnInst(llvm::CleanupReturnInst *I); friend CleanupReturnInst; // For createCleanupReturnInst() - GetElementPtrInst *createGetElementPtrInst(llvm::GetElementPtrInst *I); + LLVM_ABI GetElementPtrInst * + createGetElementPtrInst(llvm::GetElementPtrInst *I); friend GetElementPtrInst; // For createGetElementPtrInst() - CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I); + LLVM_ABI CatchSwitchInst *createCatchSwitchInst(llvm::CatchSwitchInst *I); friend CatchSwitchInst; // For createCatchSwitchInst() - ResumeInst *createResumeInst(llvm::ResumeInst *I); + LLVM_ABI ResumeInst *createResumeInst(llvm::ResumeInst *I); friend ResumeInst; // For createResumeInst() - SwitchInst *createSwitchInst(llvm::SwitchInst *I); + LLVM_ABI SwitchInst *createSwitchInst(llvm::SwitchInst *I); friend SwitchInst; // For createSwitchInst() - UnaryOperator *createUnaryOperator(llvm::UnaryOperator *I); + LLVM_ABI UnaryOperator *createUnaryOperator(llvm::UnaryOperator *I); friend UnaryOperator; // For createUnaryOperator() - BinaryOperator *createBinaryOperator(llvm::BinaryOperator *I); + LLVM_ABI BinaryOperator *createBinaryOperator(llvm::BinaryOperator *I); friend BinaryOperator; // For createBinaryOperator() - AtomicRMWInst *createAtomicRMWInst(llvm::AtomicRMWInst *I); + LLVM_ABI AtomicRMWInst *createAtomicRMWInst(llvm::AtomicRMWInst *I); friend AtomicRMWInst; // For createAtomicRMWInst() - AtomicCmpXchgInst *createAtomicCmpXchgInst(llvm::AtomicCmpXchgInst *I); + LLVM_ABI AtomicCmpXchgInst * + createAtomicCmpXchgInst(llvm::AtomicCmpXchgInst *I); friend AtomicCmpXchgInst; // For createAtomicCmpXchgInst() - AllocaInst *createAllocaInst(llvm::AllocaInst *I); + LLVM_ABI AllocaInst *createAllocaInst(llvm::AllocaInst *I); friend AllocaInst; // For createAllocaInst() - CastInst *createCastInst(llvm::CastInst *I); + LLVM_ABI CastInst *createCastInst(llvm::CastInst *I); friend CastInst; // For createCastInst() - PHINode *createPHINode(llvm::PHINode *I); + LLVM_ABI PHINode 
*createPHINode(llvm::PHINode *I); friend PHINode; // For createPHINode() - UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI); + LLVM_ABI UnreachableInst *createUnreachableInst(llvm::UnreachableInst *UI); friend UnreachableInst; // For createUnreachableInst() - CmpInst *createCmpInst(llvm::CmpInst *I); + LLVM_ABI CmpInst *createCmpInst(llvm::CmpInst *I); friend CmpInst; // For createCmpInst() - ICmpInst *createICmpInst(llvm::ICmpInst *I); + LLVM_ABI ICmpInst *createICmpInst(llvm::ICmpInst *I); friend ICmpInst; // For createICmpInst() - FCmpInst *createFCmpInst(llvm::FCmpInst *I); + LLVM_ABI FCmpInst *createFCmpInst(llvm::FCmpInst *I); friend FCmpInst; // For createFCmpInst() public: - Context(LLVMContext &LLVMCtx); - ~Context(); + LLVM_ABI Context(LLVMContext &LLVMCtx); + LLVM_ABI ~Context(); /// Clears function-level state. - void clear(); + LLVM_ABI void clear(); Tracker &getTracker() { return IRTracker; } /// Convenience function for `getTracker().save()` @@ -241,14 +250,14 @@ class Context { /// Convenience function for `getTracker().accept()` void accept() { IRTracker.accept(); } - sandboxir::Value *getValue(llvm::Value *V) const; + LLVM_ABI sandboxir::Value *getValue(llvm::Value *V) const; const sandboxir::Value *getValue(const llvm::Value *V) const { return getValue(const_cast(V)); } - Module *getModule(llvm::Module *LLVMM) const; + LLVM_ABI Module *getModule(llvm::Module *LLVMM) const; - Module *getOrCreateModule(llvm::Module *LLVMM); + LLVM_ABI Module *getOrCreateModule(llvm::Module *LLVMM); Type *getType(llvm::Type *LLVMTy) { if (LLVMTy == nullptr) @@ -265,10 +274,10 @@ class Context { /// This is the main API function for creating Sandbox IR. /// Note: this will not fully populate its parent module. The only globals /// that will be available are those used within the function. - Function *createFunction(llvm::Function *F); + LLVM_ABI Function *createFunction(llvm::Function *F); /// Create a sandboxir::Module corresponding to \p LLVMM. 
- Module *createModule(llvm::Module *LLVMM); + LLVM_ABI Module *createModule(llvm::Module *LLVMM); /// \Returns the number of values registered with Context. size_t getNumValues() const { return LLVMValueToValueMap.size(); } @@ -277,26 +286,26 @@ class Context { /// to be removed from its parent. Note that this will also be called when /// reverting the creation of an instruction. /// \Returns a callback ID for later deregistration. - CallbackID registerEraseInstrCallback(EraseInstrCallback CB); - void unregisterEraseInstrCallback(CallbackID ID); + LLVM_ABI CallbackID registerEraseInstrCallback(EraseInstrCallback CB); + LLVM_ABI void unregisterEraseInstrCallback(CallbackID ID); /// Register a callback that gets called right after a SandboxIR instruction /// is created. Note that this will also be called when reverting the removal /// of an instruction. /// \Returns a callback ID for later deregistration. - CallbackID registerCreateInstrCallback(CreateInstrCallback CB); - void unregisterCreateInstrCallback(CallbackID ID); + LLVM_ABI CallbackID registerCreateInstrCallback(CreateInstrCallback CB); + LLVM_ABI void unregisterCreateInstrCallback(CallbackID ID); /// Register a callback that gets called when a SandboxIR instruction is about /// to be moved. Note that this will also be called when reverting a move. /// \Returns a callback ID for later deregistration. - CallbackID registerMoveInstrCallback(MoveInstrCallback CB); - void unregisterMoveInstrCallback(CallbackID ID); + LLVM_ABI CallbackID registerMoveInstrCallback(MoveInstrCallback CB); + LLVM_ABI void unregisterMoveInstrCallback(CallbackID ID); /// Register a callback that gets called when a Use gets set. /// \Returns a callback ID for later deregistration. 
- CallbackID registerSetUseCallback(SetUseCallback CB); - void unregisterSetUseCallback(CallbackID ID); + LLVM_ABI CallbackID registerSetUseCallback(SetUseCallback CB); + LLVM_ABI void unregisterSetUseCallback(CallbackID ID); }; } // namespace sandboxir diff --git a/llvm/include/llvm/SandboxIR/Function.h b/llvm/include/llvm/SandboxIR/Function.h index 2c4b53ef6c1e6..28c69112b2b7e 100644 --- a/llvm/include/llvm/SandboxIR/Function.h +++ b/llvm/include/llvm/SandboxIR/Function.h @@ -11,6 +11,7 @@ #include "llvm/IR/Function.h" #include "llvm/SandboxIR/Constant.h" +#include "llvm/Support/Compiler.h" namespace llvm::sandboxir { @@ -56,7 +57,7 @@ class Function : public GlobalWithNodeAPI(Val)->end(), BBGetter); } - FunctionType *getFunctionType() const; + LLVM_ABI FunctionType *getFunctionType() const; /// Returns the alignment of the given function. MaybeAlign getAlign() const { return cast(Val)->getAlign(); } @@ -66,7 +67,7 @@ class Function : public GlobalWithNodeAPIgetParent(), Before->getIterator()); @@ -217,9 +218,9 @@ class Instruction : public User { } /// \Returns the BasicBlock containing this Instruction, or null if it is /// detached. - BasicBlock *getParent() const; + LLVM_ABI BasicBlock *getParent() const; /// For isa/dyn_cast. - static bool classof(const sandboxir::Value *From); + LLVM_ABI static bool classof(const sandboxir::Value *From); /// Determine whether the no signed wrap flag is set. bool hasNoUnsignedWrap() const { @@ -227,20 +228,20 @@ class Instruction : public User { } /// Set or clear the nuw flag on this instruction, which must be an operator /// which supports this flag. See LangRef.html for the meaning of this flag. - void setHasNoUnsignedWrap(bool B = true); + LLVM_ABI void setHasNoUnsignedWrap(bool B = true); /// Determine whether the no signed wrap flag is set. bool hasNoSignedWrap() const { return cast(Val)->hasNoSignedWrap(); } /// Set or clear the nsw flag on this instruction, which must be an operator /// which supports this flag. 
See LangRef.html for the meaning of this flag. - void setHasNoSignedWrap(bool B = true); + LLVM_ABI void setHasNoSignedWrap(bool B = true); /// Determine whether all fast-math-flags are set. bool isFast() const { return cast(Val)->isFast(); } /// Set or clear all fast-math-flags on this instruction, which must be an /// operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setFast(bool B); + LLVM_ABI void setFast(bool B); /// Determine whether the allow-reassociation flag is set. bool hasAllowReassoc() const { return cast(Val)->hasAllowReassoc(); @@ -248,24 +249,24 @@ class Instruction : public User { /// Set or clear the reassociation flag on this instruction, which must be /// an operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasAllowReassoc(bool B); + LLVM_ABI void setHasAllowReassoc(bool B); /// Determine whether the exact flag is set. bool isExact() const { return cast(Val)->isExact(); } /// Set or clear the exact flag on this instruction, which must be an operator /// which supports this flag. See LangRef.html for the meaning of this flag. - void setIsExact(bool B = true); + LLVM_ABI void setIsExact(bool B = true); /// Determine whether the no-NaNs flag is set. bool hasNoNaNs() const { return cast(Val)->hasNoNaNs(); } /// Set or clear the no-nans flag on this instruction, which must be an /// operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasNoNaNs(bool B); + LLVM_ABI void setHasNoNaNs(bool B); /// Determine whether the no-infs flag is set. bool hasNoInfs() const { return cast(Val)->hasNoInfs(); } /// Set or clear the no-infs flag on this instruction, which must be an /// operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasNoInfs(bool B); + LLVM_ABI void setHasNoInfs(bool B); /// Determine whether the no-signed-zeros flag is set. 
bool hasNoSignedZeros() const { return cast(Val)->hasNoSignedZeros(); @@ -273,7 +274,7 @@ class Instruction : public User { /// Set or clear the no-signed-zeros flag on this instruction, which must be /// an operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasNoSignedZeros(bool B); + LLVM_ABI void setHasNoSignedZeros(bool B); /// Determine whether the allow-reciprocal flag is set. bool hasAllowReciprocal() const { return cast(Val)->hasAllowReciprocal(); @@ -281,7 +282,7 @@ class Instruction : public User { /// Set or clear the allow-reciprocal flag on this instruction, which must be /// an operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasAllowReciprocal(bool B); + LLVM_ABI void setHasAllowReciprocal(bool B); /// Determine whether the allow-contract flag is set. bool hasAllowContract() const { return cast(Val)->hasAllowContract(); @@ -289,7 +290,7 @@ class Instruction : public User { /// Set or clear the allow-contract flag on this instruction, which must be /// an operator which supports this flag. See LangRef.html for the meaning of /// this flag. - void setHasAllowContract(bool B); + LLVM_ABI void setHasAllowContract(bool B); /// Determine whether the approximate-math-functions flag is set. bool hasApproxFunc() const { return cast(Val)->hasApproxFunc(); @@ -297,7 +298,7 @@ class Instruction : public User { /// Set or clear the approximate-math-functions flag on this instruction, /// which must be an operator which supports this flag. See LangRef.html for /// the meaning of this flag. - void setHasApproxFunc(bool B); + LLVM_ABI void setHasApproxFunc(bool B); /// Convenience function for getting all the fast-math flags, which must be an /// operator which supports these flags. See LangRef.html for the meaning of /// these flags. 
@@ -307,11 +308,11 @@ class Instruction : public User { /// Convenience function for setting multiple fast-math flags on this /// instruction, which must be an operator which supports these flags. See /// LangRef.html for the meaning of these flags. - void setFastMathFlags(FastMathFlags FMF); + LLVM_ABI void setFastMathFlags(FastMathFlags FMF); /// Convenience function for transferring all fast-math flag values to this /// instruction, which must be an operator which supports these flags. See /// LangRef.html for the meaning of these flags. - void copyFastMathFlags(FastMathFlags FMF); + LLVM_ABI void copyFastMathFlags(FastMathFlags FMF); bool isAssociative() const { return cast(Val)->isAssociative(); @@ -352,7 +353,7 @@ class Instruction : public User { bool isVolatile() const { return cast(Val)->isVolatile(); } - Type *getAccessType() const; + LLVM_ABI Type *getAccessType() const; bool mayThrow(bool IncludePhaseOneUnwind = false) const { return cast(Val)->mayThrow(IncludePhaseOneUnwind); @@ -414,22 +415,22 @@ class FenceInst : public SingleLLVMInstructionImpl { friend Context; // For constructor; public: - static FenceInst *create(AtomicOrdering Ordering, InsertPosition Pos, - Context &Ctx, - SyncScope::ID SSID = SyncScope::System); + LLVM_ABI static FenceInst *create(AtomicOrdering Ordering, InsertPosition Pos, + Context &Ctx, + SyncScope::ID SSID = SyncScope::System); /// Returns the ordering constraint of this fence instruction. AtomicOrdering getOrdering() const { return cast(Val)->getOrdering(); } /// Sets the ordering constraint of this fence instruction. May only be /// Acquire, Release, AcquireRelease, or SequentiallyConsistent. - void setOrdering(AtomicOrdering Ordering); + LLVM_ABI void setOrdering(AtomicOrdering Ordering); /// Returns the synchronization scope ID of this fence instruction. SyncScope::ID getSyncScopeID() const { return cast(Val)->getSyncScopeID(); } /// Sets the synchronization scope ID of this fence instruction. 
- void setSyncScopeID(SyncScope::ID SSID); + LLVM_ABI void setSyncScopeID(SyncScope::ID SSID); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Fence; } @@ -443,9 +444,9 @@ class SelectInst : public SingleLLVMInstructionImpl { friend Context; // for SelectInst() public: - static Value *create(Value *Cond, Value *True, Value *False, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Cond, Value *True, Value *False, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); const Value *getCondition() const { return getOperand(0); } const Value *getTrueValue() const { return getOperand(1); } @@ -457,7 +458,7 @@ class SelectInst : public SingleLLVMInstructionImpl { void setCondition(Value *New) { setOperand(0, New); } void setTrueValue(Value *New) { setOperand(1, New); } void setFalseValue(Value *New) { setOperand(2, New); } - void swapValues(); + LLVM_ABI void swapValues(); /// Return a string if the specified operands are invalid for a select /// operation, otherwise return null. @@ -468,7 +469,7 @@ class SelectInst : public SingleLLVMInstructionImpl { } /// For isa/dyn_cast. 
- static bool classof(const Value *From); + LLVM_ABI static bool classof(const Value *From); }; class InsertElementInst final @@ -480,9 +481,9 @@ class InsertElementInst final friend class Context; // For accessing the constructor in create*() public: - static Value *create(Value *Vec, Value *NewElt, Value *Idx, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Vec, Value *NewElt, Value *Idx, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::InsertElement; } @@ -503,8 +504,8 @@ class ExtractElementInst final // create*() public: - static Value *create(Value *Vec, Value *Idx, InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Vec, Value *Idx, InsertPosition Pos, + Context &Ctx, const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::ExtractElement; } @@ -516,7 +517,7 @@ class ExtractElementInst final Value *getIndexOperand() { return getOperand(1); } const Value *getVectorOperand() const { return getOperand(0); } const Value *getIndexOperand() const { return getOperand(1); } - VectorType *getVectorOperandType() const; + LLVM_ABI VectorType *getVectorOperandType() const; }; class ShuffleVectorInst final @@ -528,18 +529,19 @@ class ShuffleVectorInst final friend class Context; // For accessing the constructor in create*() public: - static Value *create(Value *V1, Value *V2, Value *Mask, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); - static Value *create(Value *V1, Value *V2, ArrayRef Mask, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *V1, Value *V2, Value *Mask, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); + LLVM_ABI static Value *create(Value *V1, Value *V2, ArrayRef Mask, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); 
static bool classof(const Value *From) { return From->getSubclassID() == ClassID::ShuffleVector; } /// Swap the operands and adjust the mask to preserve the semantics of the /// instruction. - void commute(); + LLVM_ABI void commute(); /// Return true if a shufflevector instruction can be formed with the /// specified operands. @@ -554,7 +556,7 @@ class ShuffleVectorInst final } /// Overload to return most specific vector type. - VectorType *getType() const; + LLVM_ABI VectorType *getType() const; /// Return the shuffle mask value of this instruction for the given element /// index. Return PoisonMaskElem if the element is undef. @@ -577,12 +579,12 @@ class ShuffleVectorInst final } /// Return the mask for this instruction, for use in bitcode. - Constant *getShuffleMaskForBitcode() const; + LLVM_ABI Constant *getShuffleMaskForBitcode() const; - static Constant *convertShuffleMaskForBitcode(ArrayRef Mask, - Type *ResultTy); + LLVM_ABI static Constant *convertShuffleMaskForBitcode(ArrayRef Mask, + Type *ResultTy); - void setShuffleMask(ArrayRef Mask); + LLVM_ABI void setShuffleMask(ArrayRef Mask); ArrayRef getShuffleMask() const { return cast(Val)->getShuffleMask(); @@ -965,9 +967,9 @@ class InsertValueInst friend Context; // for InsertValueInst() public: - static Value *create(Value *Agg, Value *Val, ArrayRef Idxs, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Agg, Value *Val, ArrayRef Idxs, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::InsertValue; @@ -1024,36 +1026,37 @@ class BranchInst : public SingleLLVMInstructionImpl { friend Context; // for BranchInst() public: - static BranchInst *create(BasicBlock *IfTrue, InsertPosition Pos, - Context &Ctx); - static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse, - Value *Cond, InsertPosition Pos, Context &Ctx); + LLVM_ABI static BranchInst 
*create(BasicBlock *IfTrue, InsertPosition Pos, + Context &Ctx); + LLVM_ABI static BranchInst *create(BasicBlock *IfTrue, BasicBlock *IfFalse, + Value *Cond, InsertPosition Pos, + Context &Ctx); /// For isa/dyn_cast. - static bool classof(const Value *From); + LLVM_ABI static bool classof(const Value *From); bool isUnconditional() const { return cast(Val)->isUnconditional(); } bool isConditional() const { return cast(Val)->isConditional(); } - Value *getCondition() const; + LLVM_ABI Value *getCondition() const; void setCondition(Value *V) { setOperand(0, V); } unsigned getNumSuccessors() const { return 1 + isConditional(); } - BasicBlock *getSuccessor(unsigned SuccIdx) const; - void setSuccessor(unsigned Idx, BasicBlock *NewSucc); + LLVM_ABI BasicBlock *getSuccessor(unsigned SuccIdx) const; + LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *NewSucc); void swapSuccessors() { swapOperandsInternal(1, 2); } private: struct LLVMBBToSBBB { Context &Ctx; LLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {} - BasicBlock *operator()(llvm::BasicBlock *BB) const; + LLVM_ABI BasicBlock *operator()(llvm::BasicBlock *BB) const; }; struct ConstLLVMBBToSBBB { Context &Ctx; ConstLLVMBBToSBBB(Context &Ctx) : Ctx(Ctx) {} - const BasicBlock *operator()(const llvm::BasicBlock *BB) const; + LLVM_ABI const BasicBlock *operator()(const llvm::BasicBlock *BB) const; }; public: @@ -1109,8 +1112,9 @@ class ExtractValueInst : public UnaryInstruction { friend Context; // for ExtractValueInst() public: - static Value *create(Value *Agg, ArrayRef Idxs, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); + LLVM_ABI static Value *create(Value *Agg, ArrayRef Idxs, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::ExtractValue; @@ -1120,7 +1124,7 @@ class ExtractValueInst : public UnaryInstruction { /// with an extractvalue instruction with the specified parameters. 
/// /// Null is returned if the indices are invalid for the specified type. - static Type *getIndexedType(Type *Agg, ArrayRef Idxs); + LLVM_ABI static Type *getIndexedType(Type *Agg, ArrayRef Idxs); using idx_iterator = llvm::ExtractValueInst::idx_iterator; @@ -1163,9 +1167,9 @@ class VAArgInst : public UnaryInstruction { friend Context; // For constructor; public: - static VAArgInst *create(Value *List, Type *Ty, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); - Value *getPointerOperand(); + LLVM_ABI static VAArgInst *create(Value *List, Type *Ty, InsertPosition Pos, + Context &Ctx, const Twine &Name = ""); + LLVM_ABI Value *getPointerOperand(); const Value *getPointerOperand() const { return const_cast(this)->getPointerOperand(); } @@ -1183,8 +1187,8 @@ class FreezeInst : public UnaryInstruction { friend Context; // For constructor; public: - static FreezeInst *create(Value *V, InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static FreezeInst *create(Value *V, InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Freeze; } @@ -1200,11 +1204,11 @@ class LoadInst final : public UnaryInstruction { /// Return true if this is a load from a volatile memory location. bool isVolatile() const { return cast(Val)->isVolatile(); } /// Specify whether this is a volatile load or not. 
- void setVolatile(bool V); + LLVM_ABI void setVolatile(bool V); - static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align, - InsertPosition Pos, bool IsVolatile, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align, + InsertPosition Pos, bool IsVolatile, + Context &Ctx, const Twine &Name = ""); static LoadInst *create(Type *Ty, Value *Ptr, MaybeAlign Align, InsertPosition Pos, Context &Ctx, const Twine &Name = "") { @@ -1212,8 +1216,8 @@ class LoadInst final : public UnaryInstruction { } /// For isa/dyn_cast. - static bool classof(const Value *From); - Value *getPointerOperand() const; + LLVM_ABI static bool classof(const Value *From); + LLVM_ABI Value *getPointerOperand() const; Align getAlign() const { return cast(Val)->getAlign(); } bool isUnordered() const { return cast(Val)->isUnordered(); } bool isSimple() const { return cast(Val)->isSimple(); } @@ -1229,19 +1233,20 @@ class StoreInst final : public SingleLLVMInstructionImpl { /// Return true if this is a store from a volatile memory location. bool isVolatile() const { return cast(Val)->isVolatile(); } /// Specify whether this is a volatile store or not. - void setVolatile(bool V); + LLVM_ABI void setVolatile(bool V); - static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align, - InsertPosition Pos, bool IsVolatile, Context &Ctx); + LLVM_ABI static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align, + InsertPosition Pos, bool IsVolatile, + Context &Ctx); static StoreInst *create(Value *V, Value *Ptr, MaybeAlign Align, InsertPosition Pos, Context &Ctx) { return create(V, Ptr, Align, Pos, /*IsVolatile=*/false, Ctx); } /// For isa/dyn_cast. 
- static bool classof(const Value *From); - Value *getValueOperand() const; - Value *getPointerOperand() const; + LLVM_ABI static bool classof(const Value *From); + LLVM_ABI Value *getValueOperand() const; + LLVM_ABI Value *getPointerOperand() const; Align getAlign() const { return cast(Val)->getAlign(); } bool isSimple() const { return cast(Val)->isSimple(); } bool isUnordered() const { return cast(Val)->isUnordered(); } @@ -1260,8 +1265,8 @@ class UnreachableInst final : public Instruction { } public: - static UnreachableInst *create(InsertPosition Pos, Context &Ctx); - static bool classof(const Value *From); + LLVM_ABI static UnreachableInst *create(InsertPosition Pos, Context &Ctx); + LLVM_ABI static bool classof(const Value *From); unsigned getNumSuccessors() const { return 0; } unsigned getUseOperandNo(const Use &Use) const final { llvm_unreachable("UnreachableInst has no operands!"); @@ -1280,12 +1285,13 @@ class ReturnInst final : public SingleLLVMInstructionImpl { Context &Ctx); public: - static ReturnInst *create(Value *RetVal, InsertPosition Pos, Context &Ctx); + LLVM_ABI static ReturnInst *create(Value *RetVal, InsertPosition Pos, + Context &Ctx); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Ret; } /// \Returns null if there is no return value. 
- Value *getReturnValue() const; + LLVM_ABI Value *getReturnValue() const; }; class CallBase : public SingleLLVMInstructionImpl { @@ -1303,7 +1309,7 @@ class CallBase : public SingleLLVMInstructionImpl { Opc == Instruction::ClassID::CallBr; } - FunctionType *getFunctionType() const; + LLVM_ABI FunctionType *getFunctionType() const; op_iterator data_operands_begin() { return op_begin(); } const_op_iterator data_operands_begin() const { @@ -1390,17 +1396,17 @@ class CallBase : public SingleLLVMInstructionImpl { } bool hasArgument(const Value *V) const { return is_contained(args(), V); } - Value *getCalledOperand() const; - Use getCalledOperandUse() const; + LLVM_ABI Value *getCalledOperand() const; + LLVM_ABI Use getCalledOperandUse() const; - Function *getCalledFunction() const; + LLVM_ABI Function *getCalledFunction() const; bool isIndirectCall() const { return cast(Val)->isIndirectCall(); } bool isCallee(Use U) const { return cast(Val)->isCallee(U.LLVMUse); } - Function *getCaller(); + LLVM_ABI Function *getCaller(); const Function *getCaller() const { return const_cast(this)->getCaller(); } @@ -1412,7 +1418,7 @@ class CallBase : public SingleLLVMInstructionImpl { return cast(Val)->getIntrinsicID(); } void setCalledOperand(Value *V) { getCalledOperandUse().set(V); } - void setCalledFunction(Function *F); + LLVM_ABI void setCalledFunction(Function *F); CallingConv::ID getCallingConv() const { return cast(Val)->getCallingConv(); } @@ -1428,9 +1434,9 @@ class CallInst : public CallBase { friend class IntrinsicInst; // For constructor public: - static CallInst *create(FunctionType *FTy, Value *Func, - ArrayRef Args, InsertPosition Pos, - Context &Ctx, const Twine &NameStr = ""); + LLVM_ABI static CallInst *create(FunctionType *FTy, Value *Func, + ArrayRef Args, InsertPosition Pos, + Context &Ctx, const Twine &NameStr = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Call; @@ -1446,20 +1452,21 @@ class InvokeInst final : public 
CallBase { // create*() public: - static InvokeInst *create(FunctionType *FTy, Value *Func, - BasicBlock *IfNormal, BasicBlock *IfException, - ArrayRef Args, InsertPosition Pos, - Context &Ctx, const Twine &NameStr = ""); + LLVM_ABI static InvokeInst *create(FunctionType *FTy, Value *Func, + BasicBlock *IfNormal, + BasicBlock *IfException, + ArrayRef Args, InsertPosition Pos, + Context &Ctx, const Twine &NameStr = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Invoke; } - BasicBlock *getNormalDest() const; - BasicBlock *getUnwindDest() const; - void setNormalDest(BasicBlock *BB); - void setUnwindDest(BasicBlock *BB); - LandingPadInst *getLandingPadInst() const; - BasicBlock *getSuccessor(unsigned SuccIdx) const; + LLVM_ABI BasicBlock *getNormalDest() const; + LLVM_ABI BasicBlock *getUnwindDest() const; + LLVM_ABI void setNormalDest(BasicBlock *BB); + LLVM_ABI void setUnwindDest(BasicBlock *BB); + LLVM_ABI LandingPadInst *getLandingPadInst() const; + LLVM_ABI BasicBlock *getSuccessor(unsigned SuccIdx) const; void setSuccessor(unsigned SuccIdx, BasicBlock *NewSucc) { assert(SuccIdx < 2 && "Successor # out of range for invoke!"); if (SuccIdx == 0) @@ -1481,25 +1488,25 @@ class CallBrInst final : public CallBase { // create*() public: - static CallBrInst *create(FunctionType *FTy, Value *Func, - BasicBlock *DefaultDest, - ArrayRef IndirectDests, - ArrayRef Args, InsertPosition Pos, - Context &Ctx, const Twine &NameStr = ""); + LLVM_ABI static CallBrInst *create(FunctionType *FTy, Value *Func, + BasicBlock *DefaultDest, + ArrayRef IndirectDests, + ArrayRef Args, InsertPosition Pos, + Context &Ctx, const Twine &NameStr = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CallBr; } unsigned getNumIndirectDests() const { return cast(Val)->getNumIndirectDests(); } - Value *getIndirectDestLabel(unsigned Idx) const; - Value *getIndirectDestLabelUse(unsigned Idx) const; - BasicBlock *getDefaultDest() 
const; - BasicBlock *getIndirectDest(unsigned Idx) const; - SmallVector getIndirectDests() const; - void setDefaultDest(BasicBlock *BB); - void setIndirectDest(unsigned Idx, BasicBlock *BB); - BasicBlock *getSuccessor(unsigned Idx) const; + LLVM_ABI Value *getIndirectDestLabel(unsigned Idx) const; + LLVM_ABI Value *getIndirectDestLabelUse(unsigned Idx) const; + LLVM_ABI BasicBlock *getDefaultDest() const; + LLVM_ABI BasicBlock *getIndirectDest(unsigned Idx) const; + LLVM_ABI SmallVector getIndirectDests() const; + LLVM_ABI void setDefaultDest(BasicBlock *BB); + LLVM_ABI void setIndirectDest(unsigned Idx, BasicBlock *BB); + LLVM_ABI BasicBlock *getSuccessor(unsigned Idx) const; unsigned getNumSuccessors() const { return cast(Val)->getNumSuccessors(); } @@ -1512,9 +1519,10 @@ class LandingPadInst : public SingleLLVMInstructionImpl { friend class Context; // For constructor. public: - static LandingPadInst *create(Type *RetTy, unsigned NumReservedClauses, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static LandingPadInst *create(Type *RetTy, + unsigned NumReservedClauses, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// Return 'true' if this landingpad instruction is a /// cleanup. I.e., it should be run when unwinding even if its landing pad /// doesn't catch the exception. @@ -1522,14 +1530,14 @@ class LandingPadInst : public SingleLLVMInstructionImpl { return cast(Val)->isCleanup(); } /// Indicate that this landingpad instruction is a cleanup. - void setCleanup(bool V); + LLVM_ABI void setCleanup(bool V); // TODO: We are not implementing addClause() because we have no way to revert // it for now. /// Get the value of the clause at index Idx. Use isCatch/isFilter to /// determine what type of clause this is. - Constant *getClause(unsigned Idx) const; + LLVM_ABI Constant *getClause(unsigned Idx) const; /// Return 'true' if the clause and index Idx is a catch clause. 
bool isCatch(unsigned Idx) const { @@ -1565,12 +1573,12 @@ class FuncletPadInst : public SingleLLVMInstructionImpl { /// /// Note: This returns the associated CatchSwitchInst if this FuncletPadInst /// is a CatchPadInst. - Value *getParentPad() const; - void setParentPad(Value *ParentPad); + LLVM_ABI Value *getParentPad() const; + LLVM_ABI void setParentPad(Value *ParentPad); /// Return the Idx-th funcletpad argument. - Value *getArgOperand(unsigned Idx) const; + LLVM_ABI Value *getArgOperand(unsigned Idx) const; /// Set the Idx-th funcletpad argument. - void setArgOperand(unsigned Idx, Value *V); + LLVM_ABI void setArgOperand(unsigned Idx, Value *V); // TODO: Implement missing functions: arg_operands(). static bool classof(const Value *From) { @@ -1585,13 +1593,13 @@ class CatchPadInst : public FuncletPadInst { friend class Context; // For constructor. public: - CatchSwitchInst *getCatchSwitch() const; + LLVM_ABI CatchSwitchInst *getCatchSwitch() const; // TODO: We have not implemented setCatchSwitch() because we can't revert it // for now, as there is no CatchPadInst member function that can undo it. - static CatchPadInst *create(Value *ParentPad, ArrayRef Args, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static CatchPadInst *create(Value *ParentPad, ArrayRef Args, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CatchPad; } @@ -1603,9 +1611,10 @@ class CleanupPadInst : public FuncletPadInst { friend class Context; // For constructor. 
public: - static CleanupPadInst *create(Value *ParentPad, ArrayRef Args, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static CleanupPadInst *create(Value *ParentPad, + ArrayRef Args, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CleanupPad; } @@ -1619,16 +1628,17 @@ class CatchReturnInst friend class Context; // For constructor. public: - static CatchReturnInst *create(CatchPadInst *CatchPad, BasicBlock *BB, - InsertPosition Pos, Context &Ctx); - CatchPadInst *getCatchPad() const; - void setCatchPad(CatchPadInst *CatchPad); - BasicBlock *getSuccessor() const; - void setSuccessor(BasicBlock *NewSucc); + LLVM_ABI static CatchReturnInst *create(CatchPadInst *CatchPad, + BasicBlock *BB, InsertPosition Pos, + Context &Ctx); + LLVM_ABI CatchPadInst *getCatchPad() const; + LLVM_ABI void setCatchPad(CatchPadInst *CatchPad); + LLVM_ABI BasicBlock *getSuccessor() const; + LLVM_ABI void setSuccessor(BasicBlock *NewSucc); unsigned getNumSuccessors() { return cast(Val)->getNumSuccessors(); } - Value *getCatchSwitchParentPad() const; + LLVM_ABI Value *getCatchSwitchParentPad() const; static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CatchRet; } @@ -1642,22 +1652,22 @@ class CleanupReturnInst friend class Context; // For constructor. 
public: - static CleanupReturnInst *create(CleanupPadInst *CleanupPad, - BasicBlock *UnwindBB, InsertPosition Pos, - Context &Ctx); + LLVM_ABI static CleanupReturnInst *create(CleanupPadInst *CleanupPad, + BasicBlock *UnwindBB, + InsertPosition Pos, Context &Ctx); bool hasUnwindDest() const { return cast(Val)->hasUnwindDest(); } bool unwindsToCaller() const { return cast(Val)->unwindsToCaller(); } - CleanupPadInst *getCleanupPad() const; - void setCleanupPad(CleanupPadInst *CleanupPad); + LLVM_ABI CleanupPadInst *getCleanupPad() const; + LLVM_ABI void setCleanupPad(CleanupPadInst *CleanupPad); unsigned getNumSuccessors() const { return cast(Val)->getNumSuccessors(); } - BasicBlock *getUnwindDest() const; - void setUnwindDest(BasicBlock *NewDest); + LLVM_ABI BasicBlock *getUnwindDest() const; + LLVM_ABI void setUnwindDest(BasicBlock *NewDest); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::CleanupRet; @@ -1677,16 +1687,16 @@ class GetElementPtrInst final // create*() public: - static Value *create(Type *Ty, Value *Ptr, ArrayRef IdxList, - InsertPosition Pos, Context &Ctx, - const Twine &NameStr = ""); + LLVM_ABI static Value *create(Type *Ty, Value *Ptr, ArrayRef IdxList, + InsertPosition Pos, Context &Ctx, + const Twine &NameStr = ""); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::GetElementPtr; } - Type *getSourceElementType() const; - Type *getResultElementType() const; + LLVM_ABI Type *getSourceElementType() const; + LLVM_ABI Type *getResultElementType() const; unsigned getAddressSpace() const { return cast(Val)->getAddressSpace(); } @@ -1706,11 +1716,11 @@ class GetElementPtrInst final return const_cast(this)->indices(); } - Value *getPointerOperand() const; + LLVM_ABI Value *getPointerOperand() const; static unsigned getPointerOperandIndex() { return llvm::GetElementPtrInst::getPointerOperandIndex(); } - Type *getPointerOperandType() const; + LLVM_ABI Type *getPointerOperandType() const; 
unsigned getPointerAddressSpace() const { return cast(Val)->getPointerAddressSpace(); } @@ -1750,12 +1760,12 @@ class CatchSwitchInst friend class Context; // For accessing the constructor in create*() public: - static CatchSwitchInst *create(Value *ParentPad, BasicBlock *UnwindBB, - unsigned NumHandlers, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); + LLVM_ABI static CatchSwitchInst * + create(Value *ParentPad, BasicBlock *UnwindBB, unsigned NumHandlers, + InsertPosition Pos, Context &Ctx, const Twine &Name = ""); - Value *getParentPad() const; - void setParentPad(Value *ParentPad); + LLVM_ABI Value *getParentPad() const; + LLVM_ABI void setParentPad(Value *ParentPad); bool hasUnwindDest() const { return cast(Val)->hasUnwindDest(); @@ -1763,8 +1773,8 @@ class CatchSwitchInst bool unwindsToCaller() const { return cast(Val)->unwindsToCaller(); } - BasicBlock *getUnwindDest() const; - void setUnwindDest(BasicBlock *UnwindDest); + LLVM_ABI BasicBlock *getUnwindDest() const; + LLVM_ABI void setUnwindDest(BasicBlock *UnwindDest); unsigned getNumHandlers() const { return cast(Val)->getNumHandlers(); @@ -1810,7 +1820,7 @@ class CatchSwitchInst return make_range(handler_begin(), handler_end()); } - void addHandler(BasicBlock *Dest); + LLVM_ABI void addHandler(BasicBlock *Dest); // TODO: removeHandler() cannot be reverted because there is no equivalent // addHandler() with a handler_iterator to specify the position. 
So we can't @@ -1839,8 +1849,9 @@ class ResumeInst : public SingleLLVMInstructionImpl { friend class Context; // For accessing the constructor in create*() public: - static ResumeInst *create(Value *Exn, InsertPosition Pos, Context &Ctx); - Value *getValue() const; + LLVM_ABI static ResumeInst *create(Value *Exn, InsertPosition Pos, + Context &Ctx); + LLVM_ABI Value *getValue() const; unsigned getNumSuccessors() const { return cast(Val)->getNumSuccessors(); } @@ -1858,17 +1869,17 @@ class SwitchInst : public SingleLLVMInstructionImpl { static constexpr const unsigned DefaultPseudoIndex = llvm::SwitchInst::DefaultPseudoIndex; - static SwitchInst *create(Value *V, BasicBlock *Dest, unsigned NumCases, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static SwitchInst *create(Value *V, BasicBlock *Dest, + unsigned NumCases, InsertPosition Pos, + Context &Ctx, const Twine &Name = ""); - Value *getCondition() const; - void setCondition(Value *V); - BasicBlock *getDefaultDest() const; + LLVM_ABI Value *getCondition() const; + LLVM_ABI void setCondition(Value *V); + LLVM_ABI BasicBlock *getDefaultDest() const; bool defaultDestUnreachable() const { return cast(Val)->defaultDestUnreachable(); } - void setDefaultDest(BasicBlock *DefaultCase); + LLVM_ABI void setDefaultDest(BasicBlock *DefaultCase); unsigned getNumCases() const { return cast(Val)->getNumCases(); } @@ -1913,9 +1924,9 @@ class SwitchInst : public SingleLLVMInstructionImpl { return I; return case_default(); } - ConstantInt *findCaseDest(BasicBlock *BB); + LLVM_ABI ConstantInt *findCaseDest(BasicBlock *BB); - void addCase(ConstantInt *OnVal, BasicBlock *Dest); + LLVM_ABI void addCase(ConstantInt *OnVal, BasicBlock *Dest); /// This method removes the specified case and its successor from the switch /// instruction. Note that this operation may reorder the remaining cases at /// index idx and above. 
@@ -1923,13 +1934,13 @@ class SwitchInst : public SingleLLVMInstructionImpl { /// This action invalidates iterators for all cases following the one removed, /// including the case_end() iterator. It returns an iterator for the next /// case. - CaseIt removeCase(CaseIt It); + LLVM_ABI CaseIt removeCase(CaseIt It); unsigned getNumSuccessors() const { return cast(Val)->getNumSuccessors(); } - BasicBlock *getSuccessor(unsigned Idx) const; - void setSuccessor(unsigned Idx, BasicBlock *NewSucc); + LLVM_ABI BasicBlock *getSuccessor(unsigned Idx) const; + LLVM_ABI void setSuccessor(unsigned Idx, BasicBlock *NewSucc); static bool classof(const Value *From) { return From->getSubclassID() == ClassID::Switch; } @@ -1950,11 +1961,13 @@ class UnaryOperator : public UnaryInstruction { Ctx) {} friend Context; // for constructor. public: - static Value *create(Instruction::Opcode Op, Value *OpV, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); - static Value *createWithCopiedFlags(Instruction::Opcode Op, Value *OpV, - Value *CopyFrom, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); + LLVM_ABI static Value *create(Instruction::Opcode Op, Value *OpV, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); + LLVM_ABI static Value *createWithCopiedFlags(Instruction::Opcode Op, + Value *OpV, Value *CopyFrom, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// For isa/dyn_cast. static bool classof(const Value *From) { return From->getSubclassID() == ClassID::UnOp; @@ -2013,14 +2026,15 @@ class BinaryOperator : public SingleLLVMInstructionImpl { friend class Context; // For constructor. 
public: - static Value *create(Instruction::Opcode Op, Value *LHS, Value *RHS, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); - - static Value *createWithCopiedFlags(Instruction::Opcode Op, Value *LHS, - Value *RHS, Value *CopyFrom, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Instruction::Opcode Op, Value *LHS, Value *RHS, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); + + LLVM_ABI static Value *createWithCopiedFlags(Instruction::Opcode Op, + Value *LHS, Value *RHS, + Value *CopyFrom, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// For isa/dyn_cast. static bool classof(const Value *From) { return From->getSubclassID() == ClassID::BinaryOperator; @@ -2033,7 +2047,7 @@ class BinaryOperator : public SingleLLVMInstructionImpl { /// can also be treated as an add. class PossiblyDisjointInst : public BinaryOperator { public: - void setIsDisjoint(bool B); + LLVM_ABI void setIsDisjoint(bool B); bool isDisjoint() const { return cast(Val)->isDisjoint(); } @@ -2066,24 +2080,24 @@ class AtomicRMWInst : public SingleLLVMInstructionImpl { cast(Val)->setOperation(Op); } Align getAlign() const { return cast(Val)->getAlign(); } - void setAlignment(Align Align); + LLVM_ABI void setAlignment(Align Align); bool isVolatile() const { return cast(Val)->isVolatile(); } - void setVolatile(bool V); + LLVM_ABI void setVolatile(bool V); AtomicOrdering getOrdering() const { return cast(Val)->getOrdering(); } - void setOrdering(AtomicOrdering Ordering); + LLVM_ABI void setOrdering(AtomicOrdering Ordering); SyncScope::ID getSyncScopeID() const { return cast(Val)->getSyncScopeID(); } - void setSyncScopeID(SyncScope::ID SSID); - Value *getPointerOperand(); + LLVM_ABI void setSyncScopeID(SyncScope::ID SSID); + LLVM_ABI Value *getPointerOperand(); const Value *getPointerOperand() const { return const_cast(this)->getPointerOperand(); } - Value *getValOperand(); + LLVM_ABI Value *getValOperand(); const 
Value *getValOperand() const { return const_cast(this)->getValOperand(); } @@ -2097,11 +2111,10 @@ class AtomicRMWInst : public SingleLLVMInstructionImpl { return From->getSubclassID() == ClassID::AtomicRMW; } - static AtomicRMWInst *create(BinOp Op, Value *Ptr, Value *Val, - MaybeAlign Align, AtomicOrdering Ordering, - InsertPosition Pos, Context &Ctx, - SyncScope::ID SSID = SyncScope::System, - const Twine &Name = ""); + LLVM_ABI static AtomicRMWInst * + create(BinOp Op, Value *Ptr, Value *Val, MaybeAlign Align, + AtomicOrdering Ordering, InsertPosition Pos, Context &Ctx, + SyncScope::ID SSID = SyncScope::System, const Twine &Name = ""); }; class AtomicCmpXchgInst @@ -2119,17 +2132,17 @@ class AtomicCmpXchgInst return cast(Val)->getAlign(); } - void setAlignment(Align Align); + LLVM_ABI void setAlignment(Align Align); /// Return true if this is a cmpxchg from a volatile memory /// location. bool isVolatile() const { return cast(Val)->isVolatile(); } /// Specify whether this is a volatile cmpxchg. - void setVolatile(bool V); + LLVM_ABI void setVolatile(bool V); /// Return true if this cmpxchg may spuriously fail. 
bool isWeak() const { return cast(Val)->isWeak(); } - void setWeak(bool IsWeak); + LLVM_ABI void setWeak(bool IsWeak); static bool isValidSuccessOrdering(AtomicOrdering Ordering) { return llvm::AtomicCmpXchgInst::isValidSuccessOrdering(Ordering); } @@ -2139,30 +2152,30 @@ class AtomicCmpXchgInst AtomicOrdering getSuccessOrdering() const { return cast(Val)->getSuccessOrdering(); } - void setSuccessOrdering(AtomicOrdering Ordering); + LLVM_ABI void setSuccessOrdering(AtomicOrdering Ordering); AtomicOrdering getFailureOrdering() const { return cast(Val)->getFailureOrdering(); } - void setFailureOrdering(AtomicOrdering Ordering); + LLVM_ABI void setFailureOrdering(AtomicOrdering Ordering); AtomicOrdering getMergedOrdering() const { return cast(Val)->getMergedOrdering(); } SyncScope::ID getSyncScopeID() const { return cast(Val)->getSyncScopeID(); } - void setSyncScopeID(SyncScope::ID SSID); - Value *getPointerOperand(); + LLVM_ABI void setSyncScopeID(SyncScope::ID SSID); + LLVM_ABI Value *getPointerOperand(); const Value *getPointerOperand() const { return const_cast(this)->getPointerOperand(); } - Value *getCompareOperand(); + LLVM_ABI Value *getCompareOperand(); const Value *getCompareOperand() const { return const_cast(this)->getCompareOperand(); } - Value *getNewValOperand(); + LLVM_ABI Value *getNewValOperand(); const Value *getNewValOperand() const { return const_cast(this)->getNewValOperand(); } @@ -2172,7 +2185,7 @@ class AtomicCmpXchgInst return cast(Val)->getPointerAddressSpace(); } - static AtomicCmpXchgInst * + LLVM_ABI static AtomicCmpXchgInst * create(Value *Ptr, Value *Cmp, Value *New, MaybeAlign Align, AtomicOrdering SuccessOrdering, AtomicOrdering FailureOrdering, InsertPosition Pos, Context &Ctx, @@ -2190,9 +2203,10 @@ class AllocaInst final : public UnaryInstruction { friend class Context; // For constructor. 
public: - static AllocaInst *create(Type *Ty, unsigned AddrSpace, InsertPosition Pos, - Context &Ctx, Value *ArraySize = nullptr, - const Twine &Name = ""); + LLVM_ABI static AllocaInst *create(Type *Ty, unsigned AddrSpace, + InsertPosition Pos, Context &Ctx, + Value *ArraySize = nullptr, + const Twine &Name = ""); /// Return true if there is an allocation size parameter to the allocation /// instruction that is not 1. @@ -2201,12 +2215,12 @@ class AllocaInst final : public UnaryInstruction { } /// Get the number of elements allocated. For a simple allocation of a single /// element, this will return a constant 1 value. - Value *getArraySize(); + LLVM_ABI Value *getArraySize(); const Value *getArraySize() const { return const_cast(this)->getArraySize(); } /// Overload to return most specific pointer type. - PointerType *getType() const; + LLVM_ABI PointerType *getType() const; /// Return the address space for the allocation. unsigned getAddressSpace() const { return cast(Val)->getAddressSpace(); @@ -2222,14 +2236,14 @@ class AllocaInst final : public UnaryInstruction { return cast(Val)->getAllocationSizeInBits(DL); } /// Return the type that is being allocated by the instruction. - Type *getAllocatedType() const; + LLVM_ABI Type *getAllocatedType() const; /// for use only in special circumstances that need to generically /// transform a whole instruction (eg: IR linking and vectorization). - void setAllocatedType(Type *Ty); + LLVM_ABI void setAllocatedType(Type *Ty); /// Return the alignment of the memory that is being allocated by the /// instruction. Align getAlign() const { return cast(Val)->getAlign(); } - void setAlignment(Align Align); + LLVM_ABI void setAlignment(Align Align); /// Return true if this alloca is in the entry block of the function and is a /// constant size. If so, the code generator will fold it into the /// prolog/epilog code, so it is basically free. 
@@ -2242,7 +2256,7 @@ class AllocaInst final : public UnaryInstruction { return cast(Val)->isUsedWithInAlloca(); } /// Specify whether this alloca is used to represent the arguments to a call. - void setUsedWithInAlloca(bool V); + LLVM_ABI void setUsedWithInAlloca(bool V); static bool classof(const Value *From) { if (auto *I = dyn_cast(From)) @@ -2293,13 +2307,13 @@ class CastInst : public UnaryInstruction { friend Context; // for SBCastInstruction() public: - static Value *create(Type *DestTy, Opcode Op, Value *Operand, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static Value *create(Type *DestTy, Opcode Op, Value *Operand, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// For isa/dyn_cast. - static bool classof(const Value *From); - Type *getSrcTy() const; - Type *getDestTy() const; + LLVM_ABI static bool classof(const Value *From); + LLVM_ABI Type *getSrcTy() const; + LLVM_ABI Type *getDestTy() const; }; /// Instruction that can have a nneg flag (zext/uitofp). @@ -2308,7 +2322,7 @@ class PossiblyNonNegInst : public CastInst { bool hasNonNeg() const { return cast(Val)->hasNonNeg(); } - void setNonNeg(bool B); + LLVM_ABI void setNonNeg(bool B); /// For isa/dyn_cast. static bool classof(const Value *From) { if (auto *I = dyn_cast(From)) { @@ -2383,15 +2397,15 @@ class PHINode final : public SingleLLVMInstructionImpl { struct LLVMBBToBB { Context &Ctx; LLVMBBToBB(Context &Ctx) : Ctx(Ctx) {} - BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const; + LLVM_ABI BasicBlock *operator()(llvm::BasicBlock *LLVMBB) const; }; public: - static PHINode *create(Type *Ty, unsigned NumReservedValues, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); + LLVM_ABI static PHINode *create(Type *Ty, unsigned NumReservedValues, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); /// For isa/dyn_cast. 
- static bool classof(const Value *From); + LLVM_ABI static bool classof(const Value *From); using const_block_iterator = mapped_iterator; @@ -2417,35 +2431,36 @@ class PHINode final : public SingleLLVMInstructionImpl { unsigned getNumIncomingValues() const { return cast(Val)->getNumIncomingValues(); } - Value *getIncomingValue(unsigned Idx) const; - void setIncomingValue(unsigned Idx, Value *V); + LLVM_ABI Value *getIncomingValue(unsigned Idx) const; + LLVM_ABI void setIncomingValue(unsigned Idx, Value *V); static unsigned getOperandNumForIncomingValue(unsigned Idx) { return llvm::PHINode::getOperandNumForIncomingValue(Idx); } static unsigned getIncomingValueNumForOperand(unsigned Idx) { return llvm::PHINode::getIncomingValueNumForOperand(Idx); } - BasicBlock *getIncomingBlock(unsigned Idx) const; - BasicBlock *getIncomingBlock(const Use &U) const; + LLVM_ABI BasicBlock *getIncomingBlock(unsigned Idx) const; + LLVM_ABI BasicBlock *getIncomingBlock(const Use &U) const; - void setIncomingBlock(unsigned Idx, BasicBlock *BB); + LLVM_ABI void setIncomingBlock(unsigned Idx, BasicBlock *BB); - void addIncoming(Value *V, BasicBlock *BB); + LLVM_ABI void addIncoming(Value *V, BasicBlock *BB); - Value *removeIncomingValue(unsigned Idx); - Value *removeIncomingValue(BasicBlock *BB); + LLVM_ABI Value *removeIncomingValue(unsigned Idx); + LLVM_ABI Value *removeIncomingValue(BasicBlock *BB); - int getBasicBlockIndex(const BasicBlock *BB) const; - Value *getIncomingValueForBlock(const BasicBlock *BB) const; + LLVM_ABI int getBasicBlockIndex(const BasicBlock *BB) const; + LLVM_ABI Value *getIncomingValueForBlock(const BasicBlock *BB) const; - Value *hasConstantValue() const; + LLVM_ABI Value *hasConstantValue() const; bool hasConstantOrUndefValue() const { return cast(Val)->hasConstantOrUndefValue(); } bool isComplete() const { return cast(Val)->isComplete(); } - void replaceIncomingBlockWith(const BasicBlock *Old, BasicBlock *New); - void removeIncomingValueIf(function_ref 
Predicate); + LLVM_ABI void replaceIncomingBlockWith(const BasicBlock *Old, + BasicBlock *New); + LLVM_ABI void removeIncomingValueIf(function_ref Predicate); // TODO: Implement // void copyIncomingBlocks(iterator_range BBRange, // uint32_t ToIdx = 0) @@ -2471,21 +2486,23 @@ class CmpInst : public SingleLLVMInstructionImpl { CmpInst(llvm::CmpInst *CI, Context &Ctx, ClassID Id, Opcode Opc) : SingleLLVMInstructionImpl(Id, Opc, CI, Ctx) {} friend Context; // for CmpInst() - static Value *createCommon(Value *Cond, Value *True, Value *False, - const Twine &Name, IRBuilder<> &Builder, - Context &Ctx); + LLVM_ABI static Value *createCommon(Value *Cond, Value *True, Value *False, + const Twine &Name, IRBuilder<> &Builder, + Context &Ctx); public: using Predicate = llvm::CmpInst::Predicate; - static Value *create(Predicate Pred, Value *S1, Value *S2, InsertPosition Pos, - Context &Ctx, const Twine &Name = ""); - static Value *createWithCopiedFlags(Predicate Pred, Value *S1, Value *S2, - const Instruction *FlagsSource, - InsertPosition Pos, Context &Ctx, - const Twine &Name = ""); - void setPredicate(Predicate P); - void swapOperands(); + LLVM_ABI static Value *create(Predicate Pred, Value *S1, Value *S2, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); + LLVM_ABI static Value *createWithCopiedFlags(Predicate Pred, Value *S1, + Value *S2, + const Instruction *FlagsSource, + InsertPosition Pos, Context &Ctx, + const Twine &Name = ""); + LLVM_ABI void setPredicate(Predicate P); + LLVM_ABI void swapOperands(); WRAP_MEMBER(getPredicate); WRAP_BOTH(isFPPredicate); @@ -2517,7 +2534,7 @@ class CmpInst : public SingleLLVMInstructionImpl { } /// Create a result type for fcmp/icmp - static Type *makeCmpResultType(Type *OpndType); + LLVM_ABI static Type *makeCmpResultType(Type *OpndType); #ifndef NDEBUG void dumpOS(raw_ostream &OS) const override; @@ -2533,7 +2550,7 @@ class ICmpInst : public CmpInst { using LLVMValType = llvm::ICmpInst; public: - void swapOperands(); + 
LLVM_ABI void swapOperands(); WRAP_BOTH(getSignedPredicate); WRAP_BOTH(getUnsignedPredicate); @@ -2570,7 +2587,7 @@ class FCmpInst : public CmpInst { using LLVMValType = llvm::FCmpInst; public: - void swapOperands(); + LLVM_ABI void swapOperands(); WRAP_BOTH(isEquality); WRAP_MEMBER(isCommutative); diff --git a/llvm/include/llvm/SandboxIR/Module.h b/llvm/include/llvm/SandboxIR/Module.h index 429bb04539bcb..275960392211d 100644 --- a/llvm/include/llvm/SandboxIR/Module.h +++ b/llvm/include/llvm/SandboxIR/Module.h @@ -11,6 +11,7 @@ #include "llvm/ADT/STLExtras.h" #include "llvm/IR/Module.h" +#include "llvm/Support/Compiler.h" #include namespace llvm { @@ -38,7 +39,7 @@ class Module { public: Context &getContext() const { return Ctx; } - Function *getFunction(StringRef Name) const; + LLVM_ABI Function *getFunction(StringRef Name) const; const DataLayout &getDataLayout() const { return LLVMM.getDataLayout(); } @@ -50,7 +51,8 @@ class Module { /// does not exist, return null. If AllowInternal is set to true, this /// function will return types that have InternalLinkage. By default, these /// types are not returned. - GlobalVariable *getGlobalVariable(StringRef Name, bool AllowInternal) const; + LLVM_ABI GlobalVariable *getGlobalVariable(StringRef Name, + bool AllowInternal) const; GlobalVariable *getGlobalVariable(StringRef Name) const { return getGlobalVariable(Name, /*AllowInternal=*/false); } @@ -66,12 +68,12 @@ class Module { /// Return the global alias in the module with the specified name, of /// arbitrary type. This method returns null if a global with the specified /// name is not found. - GlobalAlias *getNamedAlias(StringRef Name) const; + LLVM_ABI GlobalAlias *getNamedAlias(StringRef Name) const; /// Return the global ifunc in the module with the specified name, of /// arbitrary type. This method returns null if a global with the specified /// name is not found. 
- GlobalIFunc *getNamedIFunc(StringRef Name) const; + LLVM_ABI GlobalIFunc *getNamedIFunc(StringRef Name) const; // TODO: Missing removeGlobalVariable() eraseGlobalVariable(), // insertGlobalVariable() diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h index 55a0301f4756b..6fccaf04b270a 100644 --- a/llvm/include/llvm/SandboxIR/PassManager.h +++ b/llvm/include/llvm/SandboxIR/PassManager.h @@ -18,6 +18,7 @@ #ifndef LLVM_SANDBOXIR_PASSMANAGER_H #define LLVM_SANDBOXIR_PASSMANAGER_H +#include "llvm/Support/Compiler.h" #include #include "llvm/ADT/DenseMap.h" @@ -201,7 +202,7 @@ class PassManager : public ParentPass { } }; -class FunctionPassManager final +class LLVM_ABI FunctionPassManager final : public PassManager { public: FunctionPassManager(StringRef Name) : PassManager(Name) {} @@ -211,7 +212,8 @@ class FunctionPassManager final bool runOnFunction(Function &F, const Analyses &A) final; }; -class RegionPassManager final : public PassManager { +class LLVM_ABI RegionPassManager final + : public PassManager { public: RegionPassManager(StringRef Name) : PassManager(Name) {} RegionPassManager(StringRef Name, StringRef Pipeline, diff --git a/llvm/include/llvm/SandboxIR/Region.h b/llvm/include/llvm/SandboxIR/Region.h index f86199ab6c228..d70f21277fb1b 100644 --- a/llvm/include/llvm/SandboxIR/Region.h +++ b/llvm/include/llvm/SandboxIR/Region.h @@ -9,6 +9,7 @@ #ifndef LLVM_SANDBOXIR_REGION_H #define LLVM_SANDBOXIR_REGION_H +#include "llvm/Support/Compiler.h" #include #include "llvm/ADT/SetVector.h" @@ -30,7 +31,7 @@ class ScoreBoard { /// The cost of all instructions that got removed and replaced by new ones. InstructionCost BeforeCost = 0; /// Helper for both add() and remove(). \Returns the TTI cost of \p I. - InstructionCost getCost(Instruction *I) const; + LLVM_ABI InstructionCost getCost(Instruction *I) const; /// No need to allow copies. 
ScoreBoard(const ScoreBoard &) = delete; const ScoreBoard &operator=(const ScoreBoard &) = delete; @@ -40,7 +41,7 @@ class ScoreBoard { /// Mark \p I as a newly added instruction to the region. void add(Instruction *I) { AfterCost += getCost(I); } /// Mark \p I as a deleted instruction from the region. - void remove(Instruction *I); + LLVM_ABI void remove(Instruction *I); /// \Returns the cost of the newly added instructions. InstructionCost getAfterCost() const { return AfterCost; } /// \Returns the cost of the Removed instructions. @@ -122,12 +123,12 @@ class Region { /// add an instruction to the auxiliary vector it does get tagged as being a /// member of the region (for ownership reasons), but its cost does not get /// counted because the instruction hasn't been added in the "normal" way. - void addImpl(Instruction *I, bool IgnoreCost); + LLVM_ABI void addImpl(Instruction *I, bool IgnoreCost); /// Adds I to the set. This is the main API for adding an instruction to the /// region. void add(Instruction *I) { addImpl(I, /*IgnoreCost=*/false); } /// Removes I from the set. - void remove(Instruction *I); + LLVM_ABI void remove(Instruction *I); friend class Context; // The callbacks need to call add() and remove(). friend class RegionInternalsAttorney; // For unit tests. friend class RegionsFromBBs; // For add(). @@ -141,8 +142,8 @@ class Region { void removeFromAux(Instruction *I); public: - Region(Context &Ctx, TargetTransformInfo &TTI); - ~Region(); + LLVM_ABI Region(Context &Ctx, TargetTransformInfo &TTI); + LLVM_ABI ~Region(); Context &getContext() const { return Ctx; } /// Returns true if I is in the Region. @@ -150,18 +151,18 @@ class Region { /// Returns true if the Region has no instructions. bool empty() const { return Insts.empty(); } /// Set the auxiliary vector. - void setAux(ArrayRef Aux); + LLVM_ABI void setAux(ArrayRef Aux); /// \Returns the auxiliary vector. const SmallVector &getAux() const { return Aux; } /// Clears all auxiliary data. 
- void clearAux(); + LLVM_ABI void clearAux(); using iterator = decltype(Insts.begin()); iterator begin() { return Insts.begin(); } iterator end() { return Insts.end(); } iterator_range insts() { return make_range(begin(), end()); } - static SmallVector> + LLVM_ABI static SmallVector> createRegionsFromMD(Function &F, TargetTransformInfo &TTI); /// \Returns the ScoreBoard data structure that keeps track of instr costs. const ScoreBoard &getScoreboard() const { return Scoreboard; } diff --git a/llvm/include/llvm/SandboxIR/Tracker.h b/llvm/include/llvm/SandboxIR/Tracker.h index f7b469965eae8..9a2c9dd516489 100644 --- a/llvm/include/llvm/SandboxIR/Tracker.h +++ b/llvm/include/llvm/SandboxIR/Tracker.h @@ -46,6 +46,8 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/Instruction.h" #include "llvm/SandboxIR/Use.h" +#include "llvm/SandboxIR/Value.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include @@ -149,7 +151,7 @@ class UseSet : public IRChangeBase { #endif }; -class PHIRemoveIncoming : public IRChangeBase { +class LLVM_ABI PHIRemoveIncoming : public IRChangeBase { PHINode *PHI; unsigned RemovedIdx; Value *RemovedV; @@ -165,7 +167,7 @@ class PHIRemoveIncoming : public IRChangeBase { #endif }; -class PHIAddIncoming : public IRChangeBase { +class LLVM_ABI PHIAddIncoming : public IRChangeBase { PHINode *PHI; unsigned Idx; @@ -179,7 +181,7 @@ class PHIAddIncoming : public IRChangeBase { #endif }; -class CmpSwapOperands : public IRChangeBase { +class LLVM_ABI CmpSwapOperands : public IRChangeBase { CmpInst *Cmp; public: @@ -210,7 +212,7 @@ class UseSwap : public IRChangeBase { #endif }; -class EraseFromParent : public IRChangeBase { +class LLVM_ABI EraseFromParent : public IRChangeBase { /// Contains all the data we need to restore an "erased" (i.e., detached) /// instruction: the instruction itself and its operands in order. 
struct InstrAndOperands { @@ -242,7 +244,7 @@ class EraseFromParent : public IRChangeBase { #endif }; -class RemoveFromParent : public IRChangeBase { +class LLVM_ABI RemoveFromParent : public IRChangeBase { /// The instruction that is about to get removed. Instruction *RemovedI = nullptr; /// This is either the next instr, or the parent BB if at the end of the BB. @@ -327,7 +329,7 @@ class GenericSetterWithIdx final : public IRChangeBase { #endif }; -class CatchSwitchAddHandler : public IRChangeBase { +class LLVM_ABI CatchSwitchAddHandler : public IRChangeBase { CatchSwitchInst *CSI; unsigned HandlerIdx; @@ -344,7 +346,7 @@ class CatchSwitchAddHandler : public IRChangeBase { #endif // NDEBUG }; -class SwitchAddCase : public IRChangeBase { +class LLVM_ABI SwitchAddCase : public IRChangeBase { SwitchInst *Switch; ConstantInt *Val; @@ -359,7 +361,7 @@ class SwitchAddCase : public IRChangeBase { #endif // NDEBUG }; -class SwitchRemoveCase : public IRChangeBase { +class LLVM_ABI SwitchRemoveCase : public IRChangeBase { SwitchInst *Switch; struct Case { ConstantInt *Val; @@ -378,7 +380,7 @@ class SwitchRemoveCase : public IRChangeBase { #endif // NDEBUG }; -class MoveInstr : public IRChangeBase { +class LLVM_ABI MoveInstr : public IRChangeBase { /// The instruction that moved. 
Instruction *MovedI; /// This is either the next instruction in the block, or the parent BB if at @@ -395,7 +397,7 @@ class MoveInstr : public IRChangeBase { #endif // NDEBUG }; -class InsertIntoBB final : public IRChangeBase { +class LLVM_ABI InsertIntoBB final : public IRChangeBase { Instruction *InsertedI = nullptr; public: @@ -408,7 +410,7 @@ class InsertIntoBB final : public IRChangeBase { #endif // NDEBUG }; -class CreateAndInsertInst final : public IRChangeBase { +class LLVM_ABI CreateAndInsertInst final : public IRChangeBase { Instruction *NewI = nullptr; public: @@ -421,7 +423,7 @@ class CreateAndInsertInst final : public IRChangeBase { #endif }; -class ShuffleVectorSetMask final : public IRChangeBase { +class LLVM_ABI ShuffleVectorSetMask final : public IRChangeBase { ShuffleVectorInst *SVI; SmallVector PrevMask; @@ -472,7 +474,7 @@ class Tracker { { } - ~Tracker(); + LLVM_ABI ~Tracker(); Context &getContext() const { return Ctx; } /// \Returns true if there are no changes tracked. bool empty() const { return Changes.empty(); } @@ -506,11 +508,11 @@ class Tracker { /// \Returns the current state of the tracker. TrackerState getState() const { return State; } /// Turns on IR tracking. - void save(); + LLVM_ABI void save(); /// Stops tracking and accept changes. - void accept(); + LLVM_ABI void accept(); /// Stops tracking and reverts to saved state. 
- void revert(); + LLVM_ABI void revert(); #ifndef NDEBUG void dump(raw_ostream &OS) const; diff --git a/llvm/include/llvm/SandboxIR/Type.h b/llvm/include/llvm/SandboxIR/Type.h index f90ae096443b5..d9c5e6c098dad 100644 --- a/llvm/include/llvm/SandboxIR/Type.h +++ b/llvm/include/llvm/SandboxIR/Type.h @@ -16,6 +16,7 @@ #include "llvm/ADT/SmallPtrSet.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Type.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" @@ -264,18 +265,18 @@ class Type { /// If this is a vector type, return the element type, otherwise return /// 'this'. - Type *getScalarType() const; + LLVM_ABI Type *getScalarType() const; // TODO: ADD MISSING - static Type *getInt64Ty(Context &Ctx); - static Type *getInt32Ty(Context &Ctx); - static Type *getInt16Ty(Context &Ctx); - static Type *getInt8Ty(Context &Ctx); - static Type *getInt1Ty(Context &Ctx); - static Type *getDoubleTy(Context &Ctx); - static Type *getFloatTy(Context &Ctx); - static Type *getHalfTy(Context &Ctx); + LLVM_ABI static Type *getInt64Ty(Context &Ctx); + LLVM_ABI static Type *getInt32Ty(Context &Ctx); + LLVM_ABI static Type *getInt16Ty(Context &Ctx); + LLVM_ABI static Type *getInt8Ty(Context &Ctx); + LLVM_ABI static Type *getInt1Ty(Context &Ctx); + LLVM_ABI static Type *getDoubleTy(Context &Ctx); + LLVM_ABI static Type *getFloatTy(Context &Ctx); + LLVM_ABI static Type *getHalfTy(Context &Ctx); // TODO: missing get* /// Get the address space of this pointer or pointer vector type. 
@@ -293,7 +294,7 @@ class PointerType : public Type { public: // TODO: add missing functions - static PointerType *get(Context &Ctx, unsigned AddressSpace); + LLVM_ABI static PointerType *get(Context &Ctx, unsigned AddressSpace); static bool classof(const Type *From) { return isa(From->LLVMTy); @@ -302,7 +303,7 @@ class PointerType : public Type { class ArrayType : public Type { public: - static ArrayType *get(Type *ElementType, uint64_t NumElements); + LLVM_ABI static ArrayType *get(Type *ElementType, uint64_t NumElements); // TODO: add missing functions static bool classof(const Type *From) { return isa(From->LLVMTy); @@ -312,8 +313,8 @@ class ArrayType : public Type { class StructType : public Type { public: /// This static method is the primary way to create a literal StructType. - static StructType *get(Context &Ctx, ArrayRef Elements, - bool IsPacked = false); + LLVM_ABI static StructType *get(Context &Ctx, ArrayRef Elements, + bool IsPacked = false); bool isPacked() const { return cast(LLVMTy)->isPacked(); } @@ -325,13 +326,13 @@ class StructType : public Type { class VectorType : public Type { public: - static VectorType *get(Type *ElementType, ElementCount EC); + LLVM_ABI static VectorType *get(Type *ElementType, ElementCount EC); static VectorType *get(Type *ElementType, unsigned NumElements, bool Scalable) { return VectorType::get(ElementType, ElementCount::get(NumElements, Scalable)); } - Type *getElementType() const; + LLVM_ABI Type *getElementType() const; static VectorType *get(Type *ElementType, const VectorType *Other) { return VectorType::get(ElementType, Other->getElementCount()); @@ -340,13 +341,14 @@ class VectorType : public Type { inline ElementCount getElementCount() const { return cast(LLVMTy)->getElementCount(); } - static VectorType *getInteger(VectorType *VTy); - static VectorType *getExtendedElementVectorType(VectorType *VTy); - static VectorType *getTruncatedElementVectorType(VectorType *VTy); - static VectorType 
*getSubdividedVectorType(VectorType *VTy, int NumSubdivs); - static VectorType *getHalfElementsVectorType(VectorType *VTy); - static VectorType *getDoubleElementsVectorType(VectorType *VTy); - static bool isValidElementType(Type *ElemTy); + LLVM_ABI static VectorType *getInteger(VectorType *VTy); + LLVM_ABI static VectorType *getExtendedElementVectorType(VectorType *VTy); + LLVM_ABI static VectorType *getTruncatedElementVectorType(VectorType *VTy); + LLVM_ABI static VectorType *getSubdividedVectorType(VectorType *VTy, + int NumSubdivs); + LLVM_ABI static VectorType *getHalfElementsVectorType(VectorType *VTy); + LLVM_ABI static VectorType *getDoubleElementsVectorType(VectorType *VTy); + LLVM_ABI static bool isValidElementType(Type *ElemTy); static bool classof(const Type *From) { return isa(From->LLVMTy); @@ -355,7 +357,7 @@ class VectorType : public Type { class FixedVectorType : public VectorType { public: - static FixedVectorType *get(Type *ElementType, unsigned NumElts); + LLVM_ABI static FixedVectorType *get(Type *ElementType, unsigned NumElts); static FixedVectorType *get(Type *ElementType, const FixedVectorType *FVTy) { return get(ElementType, FVTy->getNumElements()); @@ -399,7 +401,8 @@ class FixedVectorType : public VectorType { class ScalableVectorType : public VectorType { public: - static ScalableVectorType *get(Type *ElementType, unsigned MinNumElts); + LLVM_ABI static ScalableVectorType *get(Type *ElementType, + unsigned MinNumElts); static ScalableVectorType *get(Type *ElementType, const ScalableVectorType *SVTy) { @@ -462,7 +465,7 @@ class FunctionType : public Type { /// Integer representation type class IntegerType : public Type { public: - static IntegerType *get(Context &C, unsigned NumBits); + LLVM_ABI static IntegerType *get(Context &C, unsigned NumBits); // TODO: add missing functions static bool classof(const Type *From) { return isa(From->LLVMTy); diff --git a/llvm/include/llvm/SandboxIR/Use.h b/llvm/include/llvm/SandboxIR/Use.h index 
c4a774aa3a89e..5c02c4f2b3495 100644 --- a/llvm/include/llvm/SandboxIR/Use.h +++ b/llvm/include/llvm/SandboxIR/Use.h @@ -14,6 +14,7 @@ #define LLVM_SANDBOXIR_USE_H #include "llvm/IR/Use.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" namespace llvm::sandboxir { @@ -49,11 +50,11 @@ class Use { public: operator Value *() const { return get(); } - Value *get() const; - void set(Value *V); + LLVM_ABI Value *get() const; + LLVM_ABI void set(Value *V); class User *getUser() const { return Usr; } - unsigned getOperandNo() const; - void swap(Use &OtherUse); + LLVM_ABI unsigned getOperandNo() const; + LLVM_ABI void swap(Use &OtherUse); Context *getContext() const { return Ctx; } bool operator==(const Use &Other) const { assert(Ctx == Other.Ctx && "Contexts differ!"); diff --git a/llvm/include/llvm/SandboxIR/User.h b/llvm/include/llvm/SandboxIR/User.h index 80e672de34905..c552e2e3378be 100644 --- a/llvm/include/llvm/SandboxIR/User.h +++ b/llvm/include/llvm/SandboxIR/User.h @@ -13,6 +13,7 @@ #include "llvm/IR/Value.h" #include "llvm/SandboxIR/Use.h" #include "llvm/SandboxIR/Value.h" +#include "llvm/Support/Compiler.h" namespace llvm::sandboxir { @@ -36,8 +37,8 @@ class OperandUseIterator { using iterator_category = std::input_iterator_tag; OperandUseIterator() = default; - value_type operator*() const; - OperandUseIterator &operator++(); + LLVM_ABI value_type operator*() const; + LLVM_ABI OperandUseIterator &operator++(); OperandUseIterator operator++(int) { auto Copy = *this; this->operator++(); @@ -49,13 +50,13 @@ class OperandUseIterator { bool operator!=(const OperandUseIterator &Other) const { return !(*this == Other); } - OperandUseIterator operator+(unsigned Num) const; - OperandUseIterator operator-(unsigned Num) const; - int operator-(const OperandUseIterator &Other) const; + LLVM_ABI OperandUseIterator operator+(unsigned Num) const; + LLVM_ABI OperandUseIterator operator-(unsigned Num) const; + LLVM_ABI int operator-(const 
OperandUseIterator &Other) const; }; /// A sandboxir::User has operands. -class User : public Value { +class LLVM_ABI User : public Value { protected: User(ClassID ID, llvm::Value *V, Context &Ctx) : Value(ID, V, Ctx) {} diff --git a/llvm/include/llvm/SandboxIR/Value.h b/llvm/include/llvm/SandboxIR/Value.h index dbd0208b4f3f3..dd0bc76db3e37 100644 --- a/llvm/include/llvm/SandboxIR/Value.h +++ b/llvm/include/llvm/SandboxIR/Value.h @@ -12,6 +12,7 @@ #include "llvm/IR/Metadata.h" #include "llvm/IR/Value.h" #include "llvm/SandboxIR/Use.h" +#include "llvm/Support/Compiler.h" namespace llvm::sandboxir { @@ -50,7 +51,7 @@ class UserUseIterator { UserUseIterator() = default; value_type operator*() const { return Use; } - UserUseIterator &operator++(); + LLVM_ABI UserUseIterator &operator++(); bool operator==(const UserUseIterator &Other) const { return Use == Other.Use; } @@ -179,7 +180,7 @@ class Value { void clearValue() { Val = nullptr; } template friend class LLVMOpUserItToSBTy; - Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx); + LLVM_ABI Value(ClassID SubclassID, llvm::Value *Val, Context &Ctx); /// Disable copies. Value(const Value &) = delete; Value &operator=(const Value &) = delete; @@ -191,7 +192,7 @@ class Value { using use_iterator = UserUseIterator; using const_use_iterator = UserUseIterator; - use_iterator use_begin(); + LLVM_ABI use_iterator use_begin(); const_use_iterator use_begin() const { return const_cast(this)->use_begin(); } @@ -215,7 +216,7 @@ class Value { using user_iterator = mapped_iterator; using const_user_iterator = user_iterator; - user_iterator user_begin(); + LLVM_ABI user_iterator user_begin(); user_iterator user_end() { return user_iterator(Use(nullptr, nullptr, Ctx), UseToUser()); } @@ -234,7 +235,7 @@ class Value { } /// \Returns the number of user edges (not necessarily to unique users). /// WARNING: This is a linear-time operation. 
- unsigned getNumUses() const; + LLVM_ABI unsigned getNumUses() const; /// Return true if this value has N uses or more. /// This is logically equivalent to getNumUses() >= N. /// WARNING: This can be expensive, as it is linear to the number of users. @@ -256,13 +257,14 @@ class Value { return Cnt == Num; } - Type *getType() const; + LLVM_ABI Type *getType() const; Context &getContext() const { return Ctx; } - void replaceUsesWithIf(Value *OtherV, - llvm::function_ref ShouldReplace); - void replaceAllUsesWith(Value *Other); + LLVM_ABI void + replaceUsesWithIf(Value *OtherV, + llvm::function_ref ShouldReplace); + LLVM_ABI void replaceAllUsesWith(Value *Other); /// \Returns the LLVM IR name of the bottom-most LLVM value. StringRef getName() const { return Val->getName(); } diff --git a/llvm/lib/SandboxIR/Constant.cpp b/llvm/lib/SandboxIR/Constant.cpp index 82cf0876d5800..9de88ef2cf0a0 100644 --- a/llvm/lib/SandboxIR/Constant.cpp +++ b/llvm/lib/SandboxIR/Constant.cpp @@ -305,35 +305,14 @@ GlobalT &GlobalWithNodeAPI:: } // Explicit instantiations. 
-template class GlobalWithNodeAPI; -template class GlobalWithNodeAPI; -template class GlobalWithNodeAPI; -template class GlobalWithNodeAPI; - -#if defined(_MSC_VER) && !defined(__clang__) -// These are needed for SandboxIRTest when building with LLVM_BUILD_LLVM_DYLIB -template LLVM_EXPORT_TEMPLATE GlobalIFunc & -GlobalWithNodeAPI::LLVMGVToGV::operator()(llvm::GlobalIFunc - &LLVMGV) - const; -template LLVM_EXPORT_TEMPLATE Function & -GlobalWithNodeAPI:: - LLVMGVToGV::operator()(llvm::Function &LLVMGV) const; - -template LLVM_EXPORT_TEMPLATE GlobalVariable &GlobalWithNodeAPI< - GlobalVariable, llvm::GlobalVariable, GlobalObject, - llvm::GlobalObject>::LLVMGVToGV::operator()(llvm::GlobalVariable &LLVMGV) - const; -template LLVM_EXPORT_TEMPLATE GlobalAlias & -GlobalWithNodeAPI::LLVMGVToGV::operator()(llvm::GlobalAlias - &LLVMGV) const; -#endif +template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI< + GlobalIFunc, llvm::GlobalIFunc, GlobalObject, llvm::GlobalObject>; +template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI< + Function, llvm::Function, GlobalObject, llvm::GlobalObject>; +template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI< + GlobalVariable, llvm::GlobalVariable, GlobalObject, llvm::GlobalObject>; +template class LLVM_EXPORT_TEMPLATE GlobalWithNodeAPI< + GlobalAlias, llvm::GlobalAlias, GlobalValue, llvm::GlobalValue>; void GlobalIFunc::setResolver(Constant *Resolver) { Ctx.getTracker() From 2652d1b2fd65950a66f37ed6d5ed9c4ffabacbee Mon Sep 17 00:00:00 2001 From: Andrew Rogers Date: Wed, 11 Jun 2025 09:19:47 -0700 Subject: [PATCH 100/851] [llvm] annotate interfaces in llvm/TextAPI for DLL export (#143447) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Purpose This patch is one in a series of code-mods that annotate LLVM’s public interface for export. This patch annotates the `llvm/TextAPI` library. 
These annotations currently have no meaningful impact on the LLVM build; however, they are a prerequisite to support an LLVM Windows DLL (shared library) build. ## Background This effort is tracked in #109483. Additional context is provided in [this discourse](https://discourse.llvm.org/t/psa-annotating-llvm-public-interface/85307), and documentation for `LLVM_ABI` and related annotations is found in the LLVM repo [here](https://github.com/llvm/llvm-project/blob/main/llvm/docs/InterfaceExportAnnotations.rst). These changes were generated automatically using the [Interface Definition Scanner (IDS)](https://github.com/compnerd/ids) tool, followed by formatting with `git clang-format`. ## Validation Local builds and tests to validate cross-platform compatibility. This included llvm, clang, and lldb on the following configurations: - Windows with MSVC - Windows with Clang - Linux with GCC - Linux with Clang - Darwin with Clang --- llvm/include/llvm/TextAPI/Architecture.h | 17 ++++--- llvm/include/llvm/TextAPI/ArchitectureSet.h | 13 +++--- llvm/include/llvm/TextAPI/DylibReader.h | 10 ++-- llvm/include/llvm/TextAPI/InterfaceFile.h | 34 +++++++------- llvm/include/llvm/TextAPI/PackedVersion.h | 9 ++-- llvm/include/llvm/TextAPI/Platform.h | 17 +++---- llvm/include/llvm/TextAPI/Record.h | 18 +++---- llvm/include/llvm/TextAPI/RecordVisitor.h | 5 +- llvm/include/llvm/TextAPI/RecordsSlice.h | 52 ++++++++++++--------- llvm/include/llvm/TextAPI/Symbol.h | 8 ++-- llvm/include/llvm/TextAPI/SymbolSet.h | 11 +++-- llvm/include/llvm/TextAPI/Target.h | 15 +++--- llvm/include/llvm/TextAPI/TextAPIError.h | 3 +- llvm/include/llvm/TextAPI/TextAPIReader.h | 5 +- llvm/include/llvm/TextAPI/TextAPIWriter.h | 8 ++-- llvm/include/llvm/TextAPI/Utils.h | 21 +++++---- 16 files changed, 138 insertions(+), 108 deletions(-) diff --git a/llvm/include/llvm/TextAPI/Architecture.h b/llvm/include/llvm/TextAPI/Architecture.h index 978359995074b..7a7f5416fe7c7 100644 --- 
a/llvm/include/llvm/TextAPI/Architecture.h +++ b/llvm/include/llvm/TextAPI/Architecture.h @@ -13,6 +13,7 @@ #ifndef LLVM_TEXTAPI_ARCHITECTURE_H #define LLVM_TEXTAPI_ARCHITECTURE_H +#include "llvm/Support/Compiler.h" #include #include @@ -32,24 +33,26 @@ enum Architecture : uint8_t { }; /// Convert a CPU Type and Subtype pair to an architecture slice. -Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType); +LLVM_ABI Architecture getArchitectureFromCpuType(uint32_t CPUType, + uint32_t CPUSubType); /// Convert a name to an architecture slice. -Architecture getArchitectureFromName(StringRef Name); +LLVM_ABI Architecture getArchitectureFromName(StringRef Name); /// Convert an architecture slice to a string. -StringRef getArchitectureName(Architecture Arch); +LLVM_ABI StringRef getArchitectureName(Architecture Arch); /// Convert an architecture slice to a CPU Type and Subtype pair. -std::pair getCPUTypeFromArchitecture(Architecture Arch); +LLVM_ABI std::pair +getCPUTypeFromArchitecture(Architecture Arch); /// Convert a target to an architecture slice. -Architecture mapToArchitecture(const llvm::Triple &Target); +LLVM_ABI Architecture mapToArchitecture(const llvm::Triple &Target); /// Check if architecture is 64 bit. -bool is64Bit(Architecture); +LLVM_ABI bool is64Bit(Architecture); -raw_ostream &operator<<(raw_ostream &OS, Architecture Arch); +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, Architecture Arch); } // end namespace MachO. } // end namespace llvm. 
diff --git a/llvm/include/llvm/TextAPI/ArchitectureSet.h b/llvm/include/llvm/TextAPI/ArchitectureSet.h index 2cce9dbf0d80c..a7d3394c99821 100644 --- a/llvm/include/llvm/TextAPI/ArchitectureSet.h +++ b/llvm/include/llvm/TextAPI/ArchitectureSet.h @@ -13,6 +13,7 @@ #ifndef LLVM_TEXTAPI_ARCHITECTURESET_H #define LLVM_TEXTAPI_ARCHITECTURESET_H +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/Architecture.h" #include #include @@ -38,7 +39,7 @@ class ArchitectureSet { constexpr ArchitectureSet() = default; constexpr ArchitectureSet(ArchSetType Raw) : ArchSet(Raw) {} ArchitectureSet(Architecture Arch) : ArchitectureSet() { set(Arch); } - ArchitectureSet(const std::vector &Archs); + LLVM_ABI ArchitectureSet(const std::vector &Archs); static ArchitectureSet All() { return ArchitectureSet(EndIndexVal); } @@ -61,7 +62,7 @@ class ArchitectureSet { return (ArchSet & Archs.ArchSet) == Archs.ArchSet; } - size_t count() const; + LLVM_ABI size_t count() const; bool empty() const { return ArchSet == 0; } @@ -158,9 +159,9 @@ class ArchitectureSet { const_iterator begin() const { return {&ArchSet}; } const_iterator end() const { return {&ArchSet, EndIndexVal}; } - operator std::string() const; - operator std::vector() const; - void print(raw_ostream &OS) const; + LLVM_ABI operator std::string() const; + LLVM_ABI operator std::vector() const; + LLVM_ABI void print(raw_ostream &OS) const; }; inline ArchitectureSet operator|(const Architecture &lhs, @@ -168,7 +169,7 @@ inline ArchitectureSet operator|(const Architecture &lhs, return ArchitectureSet(lhs) | ArchitectureSet(rhs); } -raw_ostream &operator<<(raw_ostream &OS, ArchitectureSet Set); +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, ArchitectureSet Set); } // end namespace MachO. } // end namespace llvm. 
diff --git a/llvm/include/llvm/TextAPI/DylibReader.h b/llvm/include/llvm/TextAPI/DylibReader.h index 6861d3cb1591b..f3a806d78df78 100644 --- a/llvm/include/llvm/TextAPI/DylibReader.h +++ b/llvm/include/llvm/TextAPI/DylibReader.h @@ -14,6 +14,7 @@ #define LLVM_TEXTAPI_DYLIBREADER_H #include "llvm/ADT/StringMap.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/TextAPI/ArchitectureSet.h" @@ -37,20 +38,21 @@ struct ParseOption { /// \param Buffer Data that points to dylib. /// \param Options Determines which attributes to extract. /// \return List of record slices. -Expected readFile(MemoryBufferRef Buffer, const ParseOption &Opt); +LLVM_ABI Expected readFile(MemoryBufferRef Buffer, + const ParseOption &Opt); /// Get TAPI file representation of binary dylib. /// /// \param Buffer Data that points to dylib. -Expected> get(MemoryBufferRef Buffer); +LLVM_ABI Expected> get(MemoryBufferRef Buffer); using SymbolToSourceLocMap = llvm::StringMap; /// Get the source location for each symbol from dylib. /// /// \param DSYM Path to DSYM file. /// \param T Requested target slice for dylib. 
-SymbolToSourceLocMap accumulateSourceLocFromDSYM(const StringRef DSYM, - const Target &T); +LLVM_ABI SymbolToSourceLocMap accumulateSourceLocFromDSYM(const StringRef DSYM, + const Target &T); } // namespace llvm::MachO::DylibReader diff --git a/llvm/include/llvm/TextAPI/InterfaceFile.h b/llvm/include/llvm/TextAPI/InterfaceFile.h index 23c27cb0f4745..747c8d0a208c5 100644 --- a/llvm/include/llvm/TextAPI/InterfaceFile.h +++ b/llvm/include/llvm/TextAPI/InterfaceFile.h @@ -18,6 +18,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/iterator.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/ArchitectureSet.h" #include "llvm/TextAPI/FileTypes.h" #include "llvm/TextAPI/PackedVersion.h" @@ -60,7 +61,7 @@ class InterfaceFileRef { StringRef getInstallName() const { return InstallName; }; - void addTarget(const Target &Target); + LLVM_ABI void addTarget(const Target &Target); template void addTargets(RangeT &&Targets) { for (const auto &Target : Targets) addTarget(Target(Target)); @@ -146,7 +147,7 @@ class InterfaceFile { /// Set and add target. /// /// \param Target the target to add into. - void addTarget(const Target &Target); + LLVM_ABI void addTarget(const Target &Target); /// Determine if target triple slice exists in file. /// @@ -174,7 +175,7 @@ class InterfaceFile { std::function>; using const_filtered_target_range = llvm::iterator_range; - const_filtered_target_range targets(ArchitectureSet Archs) const; + LLVM_ABI const_filtered_target_range targets(ArchitectureSet Archs) const; /// Set the install name of the library. void setInstallName(StringRef InstallName_) { @@ -241,7 +242,7 @@ class InterfaceFile { /// Set the parent umbrella frameworks. 
/// \param Target_ The target applicable to Parent /// \param Parent The name of Parent - void addParentUmbrella(const Target &Target_, StringRef Parent); + LLVM_ABI void addParentUmbrella(const Target &Target_, StringRef Parent); /// Get the list of Parent Umbrella frameworks. /// @@ -261,7 +262,7 @@ class InterfaceFile { /// \param InstallName The name of the client that is allowed to link this /// library. /// \param Target The target triple for which this applies. - void addAllowableClient(StringRef InstallName, const Target &Target); + LLVM_ABI void addAllowableClient(StringRef InstallName, const Target &Target); /// Get the list of allowable clients. /// @@ -274,7 +275,8 @@ class InterfaceFile { /// /// \param InstallName The name of the library to re-export. /// \param Target The target triple for which this applies. - void addReexportedLibrary(StringRef InstallName, const Target &Target); + LLVM_ABI void addReexportedLibrary(StringRef InstallName, + const Target &Target); /// Get the list of re-exported libraries. /// @@ -286,7 +288,7 @@ class InterfaceFile { /// Add a library for inlining to top level library. /// ///\param Document The library to inline with top level library. - void addDocument(std::shared_ptr &&Document); + LLVM_ABI void addDocument(std::shared_ptr &&Document); /// Returns the pointer to parent document if exists or nullptr otherwise. InterfaceFile *getParent() const { return Parent; } @@ -301,7 +303,7 @@ class InterfaceFile { /// Set the runpath search paths. /// \param RPath The name of runpath. /// \param InputTarget The target applicable to runpath search path. - void addRPath(StringRef RPath, const Target &InputTarget); + LLVM_ABI void addRPath(StringRef RPath, const Target &InputTarget); /// Get the list of runpath search paths. /// @@ -373,14 +375,14 @@ class InterfaceFile { /// /// \param Arch architecture to extract from. /// \return New InterfaceFile with extracted architecture slice. 
- llvm::Expected> + LLVM_ABI llvm::Expected> extract(Architecture Arch) const; /// Remove architecture slice from Interface. /// /// \param Arch architecture to remove. /// \return New Interface File with removed architecture slice. - llvm::Expected> + LLVM_ABI llvm::Expected> remove(Architecture Arch) const; /// Merge Interfaces for the same library. The following library attributes @@ -390,29 +392,29 @@ class InterfaceFile { /// /// \param O The Interface to merge. /// \return New Interface File that was merged. - llvm::Expected> + LLVM_ABI llvm::Expected> merge(const InterfaceFile *O) const; /// Inline reexported library into Interface. /// /// \param Library Interface of reexported library. /// \param Overwrite Whether to overwrite preexisting inlined library. - void inlineLibrary(std::shared_ptr Library, - bool Overwrite = false); + LLVM_ABI void inlineLibrary(std::shared_ptr Library, + bool Overwrite = false); /// Set InterfaceFile properties from pre-gathered binary attributes, /// if they are not set already. /// /// \param BA Attributes typically represented in load commands. /// \param Targ MachO Target slice to add attributes to. - void setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA, - const Target &Targ); + LLVM_ABI void setFromBinaryAttrs(const RecordsSlice::BinaryAttrs &BA, + const Target &Targ); /// The equality is determined by attributes that impact linking /// compatibilities. Path, & FileKind are irrelevant since these by /// itself should not impact linking. /// This is an expensive operation. 
- bool operator==(const InterfaceFile &O) const; + LLVM_ABI bool operator==(const InterfaceFile &O) const; bool operator!=(const InterfaceFile &O) const { return !(*this == O); } diff --git a/llvm/include/llvm/TextAPI/PackedVersion.h b/llvm/include/llvm/TextAPI/PackedVersion.h index e680d40c71044..cabe365e6d97a 100644 --- a/llvm/include/llvm/TextAPI/PackedVersion.h +++ b/llvm/include/llvm/TextAPI/PackedVersion.h @@ -13,6 +13,7 @@ #ifndef LLVM_TEXTAPI_PACKEDVERSION_H #define LLVM_TEXTAPI_PACKEDVERSION_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" #include #include @@ -53,8 +54,8 @@ class PackedVersion { /// Retrieve the subminor version number, if provided. unsigned getSubminor() const { return Version & 0xff; } - bool parse32(StringRef Str); - std::pair parse64(StringRef Str); + LLVM_ABI bool parse32(StringRef Str); + LLVM_ABI std::pair parse64(StringRef Str); bool operator<(const PackedVersion &O) const { return Version < O.Version; } @@ -64,9 +65,9 @@ class PackedVersion { uint32_t rawValue() const { return Version; } - operator std::string() const; + LLVM_ABI operator std::string() const; - void print(raw_ostream &OS) const; + LLVM_ABI void print(raw_ostream &OS) const; }; inline raw_ostream &operator<<(raw_ostream &OS, const PackedVersion &Version) { diff --git a/llvm/include/llvm/TextAPI/Platform.h b/llvm/include/llvm/TextAPI/Platform.h index d828d9ac49f65..8ea187acc02f9 100644 --- a/llvm/include/llvm/TextAPI/Platform.h +++ b/llvm/include/llvm/TextAPI/Platform.h @@ -14,6 +14,7 @@ #include "llvm/ADT/SmallSet.h" #include "llvm/BinaryFormat/MachO.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/VersionTuple.h" namespace llvm { @@ -22,14 +23,14 @@ namespace MachO { using PlatformSet = SmallSet; using PlatformVersionSet = SmallSet, 3>; -PlatformType mapToPlatformType(PlatformType Platform, bool WantSim); -PlatformType mapToPlatformType(const Triple &Target); -PlatformSet mapToPlatformSet(ArrayRef Targets); -StringRef 
getPlatformName(PlatformType Platform); -PlatformType getPlatformFromName(StringRef Name); -std::string getOSAndEnvironmentName(PlatformType Platform, - std::string Version = ""); -VersionTuple mapToSupportedOSVersion(const Triple &Triple); +LLVM_ABI PlatformType mapToPlatformType(PlatformType Platform, bool WantSim); +LLVM_ABI PlatformType mapToPlatformType(const Triple &Target); +LLVM_ABI PlatformSet mapToPlatformSet(ArrayRef Targets); +LLVM_ABI StringRef getPlatformName(PlatformType Platform); +LLVM_ABI PlatformType getPlatformFromName(StringRef Name); +LLVM_ABI std::string getOSAndEnvironmentName(PlatformType Platform, + std::string Version = ""); +LLVM_ABI VersionTuple mapToSupportedOSVersion(const Triple &Triple); } // end namespace MachO. } // end namespace llvm. diff --git a/llvm/include/llvm/TextAPI/Record.h b/llvm/include/llvm/TextAPI/Record.h index 7d721988ec3da..6e470d97325fd 100644 --- a/llvm/include/llvm/TextAPI/Record.h +++ b/llvm/include/llvm/TextAPI/Record.h @@ -17,6 +17,7 @@ #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/Symbol.h" #include @@ -104,7 +105,7 @@ class Record { SymbolFlags getFlags() const { return Flags; } private: - SymbolFlags mergeFlags(SymbolFlags Flags, RecordLinkage Linkage); + LLVM_ABI SymbolFlags mergeFlags(SymbolFlags Flags, RecordLinkage Linkage); protected: StringRef Name; @@ -164,9 +165,9 @@ class ObjCContainerRecord : public Record { ObjCContainerRecord(StringRef Name, RecordLinkage Linkage) : Record({Name, Linkage, SymbolFlags::Data}) {} - ObjCIVarRecord *addObjCIVar(StringRef IVar, RecordLinkage Linkage); - ObjCIVarRecord *findObjCIVar(StringRef IVar) const; - std::vector getObjCIVars() const; + LLVM_ABI ObjCIVarRecord *addObjCIVar(StringRef IVar, RecordLinkage Linkage); + LLVM_ABI ObjCIVarRecord *findObjCIVar(StringRef IVar) const; + LLVM_ABI std::vector getObjCIVars() const; RecordLinkage getLinkage() const { 
return Linkage; } private: @@ -207,11 +208,12 @@ class ObjCInterfaceRecord : public ObjCContainerRecord { return getLinkageForSymbol(CurrType) >= RecordLinkage::Rexported; } - RecordLinkage getLinkageForSymbol(ObjCIFSymbolKind CurrType) const; - void updateLinkageForSymbols(ObjCIFSymbolKind SymType, RecordLinkage Link); + LLVM_ABI RecordLinkage getLinkageForSymbol(ObjCIFSymbolKind CurrType) const; + LLVM_ABI void updateLinkageForSymbols(ObjCIFSymbolKind SymType, + RecordLinkage Link); - bool addObjCCategory(ObjCCategoryRecord *Record); - std::vector getObjCCategories() const; + LLVM_ABI bool addObjCCategory(ObjCCategoryRecord *Record); + LLVM_ABI std::vector getObjCCategories() const; private: /// Linkage level for each symbol represented in ObjCInterfaceRecord. diff --git a/llvm/include/llvm/TextAPI/RecordVisitor.h b/llvm/include/llvm/TextAPI/RecordVisitor.h index 34e43f5b0027f..65bc96df244d7 100644 --- a/llvm/include/llvm/TextAPI/RecordVisitor.h +++ b/llvm/include/llvm/TextAPI/RecordVisitor.h @@ -13,6 +13,7 @@ #ifndef LLVM_TEXTAPI_RECORDVISITOR_H #define LLVM_TEXTAPI_RECORDVISITOR_H +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/Record.h" #include "llvm/TextAPI/SymbolSet.h" @@ -20,7 +21,7 @@ namespace llvm { namespace MachO { /// Base class for any usage of traversing over collected Records. -class RecordVisitor { +class LLVM_ABI RecordVisitor { public: virtual ~RecordVisitor(); @@ -32,7 +33,7 @@ class RecordVisitor { /// Specialized RecordVisitor for collecting exported symbols /// and undefined symbols if RecordSlice being visited represents a /// flat-namespaced library. 
-class SymbolConverter : public RecordVisitor { +class LLVM_ABI SymbolConverter : public RecordVisitor { public: SymbolConverter(SymbolSet *Symbols, const Target &T, const bool RecordUndefs = false) diff --git a/llvm/include/llvm/TextAPI/RecordsSlice.h b/llvm/include/llvm/TextAPI/RecordsSlice.h index f934cf7607f1f..6ecb79a115aea 100644 --- a/llvm/include/llvm/TextAPI/RecordsSlice.h +++ b/llvm/include/llvm/TextAPI/RecordsSlice.h @@ -15,6 +15,7 @@ #define LLVM_TEXTAPI_RECORDSLICE_H #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/FileTypes.h" #include "llvm/TextAPI/PackedVersion.h" #include "llvm/TextAPI/Record.h" @@ -43,9 +44,10 @@ class RecordsSlice { /// symbol. /// \param Linkage The linkage of symbol. /// \return The non-owning pointer to added record in slice. - Record *addRecord(StringRef Name, SymbolFlags Flags, - GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown, - RecordLinkage Linkage = RecordLinkage::Unknown); + LLVM_ABI Record * + addRecord(StringRef Name, SymbolFlags Flags, + GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown, + RecordLinkage Linkage = RecordLinkage::Unknown); /// Add non-ObjC global record. /// @@ -56,10 +58,10 @@ class RecordsSlice { /// \param Inlined Whether declaration is inlined, only applicable to /// functions. /// \return The non-owning pointer to added record in slice. - GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage, - GlobalRecord::Kind GV, - SymbolFlags Flags = SymbolFlags::None, - bool Inlined = false); + LLVM_ABI GlobalRecord *addGlobal(StringRef Name, RecordLinkage Linkage, + GlobalRecord::Kind GV, + SymbolFlags Flags = SymbolFlags::None, + bool Inlined = false); /// Add ObjC Class record. /// @@ -67,8 +69,9 @@ class RecordsSlice { /// \param Linkage The linkage of symbol. /// \param SymType The symbols this class represents. /// \return The non-owning pointer to added record in slice. 
- ObjCInterfaceRecord *addObjCInterface(StringRef Name, RecordLinkage Linkage, - ObjCIFSymbolKind SymType); + LLVM_ABI ObjCInterfaceRecord *addObjCInterface(StringRef Name, + RecordLinkage Linkage, + ObjCIFSymbolKind SymType); /// Add ObjC IVar record. /// @@ -76,8 +79,8 @@ class RecordsSlice { /// \param Name The name of ivar, not symbol. /// \param Linkage The linkage of symbol. /// \return The non-owning pointer to added record in slice. - ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container, StringRef Name, - RecordLinkage Linkage); + LLVM_ABI ObjCIVarRecord *addObjCIVar(ObjCContainerRecord *Container, + StringRef Name, RecordLinkage Linkage); /// Add ObjC Category record. /// @@ -85,22 +88,22 @@ class RecordsSlice { /// category, not symbol. /// \param Category The name of category. /// \return The non-owning pointer to added record in slice. - ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend, - StringRef Category); + LLVM_ABI ObjCCategoryRecord *addObjCCategory(StringRef ClassToExtend, + StringRef Category); /// Find ObjC Class. /// /// \param Name name of class, not full symbol name. /// \return The non-owning pointer to record in slice. - ObjCInterfaceRecord *findObjCInterface(StringRef Name) const; + LLVM_ABI ObjCInterfaceRecord *findObjCInterface(StringRef Name) const; /// Find ObjC Category. /// /// \param ClassToExtend The name of class, not full symbol name. /// \param Category The name of category. /// \return The non-owning pointer to record in slice. - ObjCCategoryRecord *findObjCCategory(StringRef ClassToExtend, - StringRef Category) const; + LLVM_ABI ObjCCategoryRecord *findObjCCategory(StringRef ClassToExtend, + StringRef Category) const; /// Find ObjC Container. This is commonly used for assigning for looking up /// instance variables that are assigned to either a category or class. @@ -110,21 +113,23 @@ class RecordsSlice { /// \param Name Either the name of ivar or name of container. 
/// \return The non-owning pointer to record in /// slice. - ObjCContainerRecord *findContainer(bool IsIVar, StringRef Name) const; + LLVM_ABI ObjCContainerRecord *findContainer(bool IsIVar, + StringRef Name) const; /// Find ObjC instance variable. /// /// \param IsScopedName This is used to determine how to parse the name. /// \param Name Either the full name of the symbol or just the ivar. /// \return The non-owning pointer to record in slice. - ObjCIVarRecord *findObjCIVar(bool IsScopedName, StringRef Name) const; + LLVM_ABI ObjCIVarRecord *findObjCIVar(bool IsScopedName, + StringRef Name) const; /// Find non-objc global. /// /// \param Name The name of symbol. /// \param GV The Kind of global to find. /// \return The non-owning pointer to record in slice. - GlobalRecord * + LLVM_ABI GlobalRecord * findGlobal(StringRef Name, GlobalRecord::Kind GV = GlobalRecord::Kind::Unknown) const; @@ -138,7 +143,7 @@ class RecordsSlice { } // Visit all records known to RecordsSlice. - void visit(RecordVisitor &V) const; + LLVM_ABI void visit(RecordVisitor &V) const; struct BinaryAttrs { std::vector AllowableClients; @@ -158,11 +163,11 @@ class RecordsSlice { }; /// Return reference to BinaryAttrs. - BinaryAttrs &getBinaryAttrs(); + LLVM_ABI BinaryAttrs &getBinaryAttrs(); /// Store any strings owned by RecordSlice into allocator and return back /// reference to that. 
- StringRef copyString(StringRef String); + LLVM_ABI StringRef copyString(StringRef String); private: const llvm::Triple TargetTriple; @@ -196,7 +201,8 @@ class RecordsSlice { using Records = llvm::SmallVector, 4>; class InterfaceFile; -std::unique_ptr convertToInterfaceFile(const Records &Slices); +LLVM_ABI std::unique_ptr +convertToInterfaceFile(const Records &Slices); } // namespace MachO } // namespace llvm diff --git a/llvm/include/llvm/TextAPI/Symbol.h b/llvm/include/llvm/TextAPI/Symbol.h index 5a5eb0eb48325..92ff0746f7995 100644 --- a/llvm/include/llvm/TextAPI/Symbol.h +++ b/llvm/include/llvm/TextAPI/Symbol.h @@ -11,6 +11,7 @@ #include "llvm/ADT/BitmaskEnum.h" #include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TextAPI/ArchitectureSet.h" #include "llvm/TextAPI/Target.h" @@ -152,14 +153,15 @@ class Symbol { std::function>; using const_filtered_target_range = llvm::iterator_range; - const_filtered_target_range targets(ArchitectureSet architectures) const; + LLVM_ABI const_filtered_target_range + targets(ArchitectureSet architectures) const; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump(raw_ostream &OS) const; void dump() const { dump(llvm::errs()); } #endif - bool operator==(const Symbol &O) const; + LLVM_ABI bool operator==(const Symbol &O) const; bool operator!=(const Symbol &O) const { return !(*this == O); } @@ -189,7 +191,7 @@ struct SimpleSymbol { /// Get symbol classification by parsing the name of a symbol. /// /// \param SymName The name of symbol. -SimpleSymbol parseSymbol(StringRef SymName); +LLVM_ABI SimpleSymbol parseSymbol(StringRef SymName); } // end namespace MachO. } // end namespace llvm. 
diff --git a/llvm/include/llvm/TextAPI/SymbolSet.h b/llvm/include/llvm/TextAPI/SymbolSet.h index 6ccabb9077208..cd3066317f3ae 100644 --- a/llvm/include/llvm/TextAPI/SymbolSet.h +++ b/llvm/include/llvm/TextAPI/SymbolSet.h @@ -15,6 +15,7 @@ #include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/Support/Allocator.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/Architecture.h" #include "llvm/TextAPI/ArchitectureSet.h" #include "llvm/TextAPI/Symbol.h" @@ -87,12 +88,12 @@ class SymbolSet { using SymbolsMapType = llvm::DenseMap; SymbolsMapType Symbols; - Symbol *addGlobalImpl(EncodeKind, StringRef Name, SymbolFlags Flags); + LLVM_ABI Symbol *addGlobalImpl(EncodeKind, StringRef Name, SymbolFlags Flags); public: SymbolSet() = default; - Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags, - const Target &Targ); + LLVM_ABI Symbol *addGlobal(EncodeKind Kind, StringRef Name, SymbolFlags Flags, + const Target &Targ); size_t size() const { return Symbols.size(); } template create(StringRef Target); + LLVM_ABI static llvm::Expected create(StringRef Target); - operator std::string() const; + LLVM_ABI operator std::string() const; Architecture Arch; PlatformType Platform; @@ -66,13 +67,13 @@ inline bool operator!=(const Target &LHS, const Architecture &RHS) { return LHS.Arch != RHS; } -PlatformVersionSet mapToPlatformVersionSet(ArrayRef Targets); -PlatformSet mapToPlatformSet(ArrayRef Targets); -ArchitectureSet mapToArchitectureSet(ArrayRef Targets); +LLVM_ABI PlatformVersionSet mapToPlatformVersionSet(ArrayRef Targets); +LLVM_ABI PlatformSet mapToPlatformSet(ArrayRef Targets); +LLVM_ABI ArchitectureSet mapToArchitectureSet(ArrayRef Targets); -std::string getTargetTripleName(const Target &Targ); +LLVM_ABI std::string getTargetTripleName(const Target &Targ); -raw_ostream &operator<<(raw_ostream &OS, const Target &Target); +LLVM_ABI raw_ostream &operator<<(raw_ostream &OS, const Target &Target); } // namespace MachO } // 
namespace llvm diff --git a/llvm/include/llvm/TextAPI/TextAPIError.h b/llvm/include/llvm/TextAPI/TextAPIError.h index f0578654697b8..7b2182edd6210 100644 --- a/llvm/include/llvm/TextAPI/TextAPIError.h +++ b/llvm/include/llvm/TextAPI/TextAPIError.h @@ -14,6 +14,7 @@ #ifndef LLVM_TEXTAPI_TEXTAPIERROR_H #define LLVM_TEXTAPI_TEXTAPIERROR_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" namespace llvm::MachO { @@ -25,7 +26,7 @@ enum class TextAPIErrorCode { UnsupportedTarget }; -class TextAPIError : public llvm::ErrorInfo { +class LLVM_ABI TextAPIError : public llvm::ErrorInfo { public: static char ID; TextAPIErrorCode EC; diff --git a/llvm/include/llvm/TextAPI/TextAPIReader.h b/llvm/include/llvm/TextAPI/TextAPIReader.h index 32af0e3601f18..603b24b47283d 100644 --- a/llvm/include/llvm/TextAPI/TextAPIReader.h +++ b/llvm/include/llvm/TextAPI/TextAPIReader.h @@ -9,6 +9,7 @@ #ifndef LLVM_TEXTAPI_TEXTAPIREADER_H #define LLVM_TEXTAPI_TEXTAPIREADER_H +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" namespace llvm { @@ -29,13 +30,13 @@ class TextAPIReader { /// /// \param InputBuffer Buffer holding contents of TAPI text file. /// \return The file format version of TAPI text file. - static Expected canRead(MemoryBufferRef InputBuffer); + LLVM_ABI static Expected canRead(MemoryBufferRef InputBuffer); /// Parse and get an InterfaceFile that represents the full /// library. /// /// \param InputBuffer Buffer holding contents of TAPI text file. 
- static Expected> + LLVM_ABI static Expected> get(MemoryBufferRef InputBuffer); TextAPIReader() = delete; diff --git a/llvm/include/llvm/TextAPI/TextAPIWriter.h b/llvm/include/llvm/TextAPI/TextAPIWriter.h index 7fd32c6fe2a9e..5f06c372fe852 100644 --- a/llvm/include/llvm/TextAPI/TextAPIWriter.h +++ b/llvm/include/llvm/TextAPI/TextAPIWriter.h @@ -10,6 +10,7 @@ #define LLVM_TEXTAPI_TEXTAPIWRITER_H #include "llvm/ADT/StringSwitch.h" +#include "llvm/Support/Compiler.h" #include "llvm/TextAPI/InterfaceFile.h" namespace llvm { @@ -30,9 +31,10 @@ class TextAPIWriter { /// \param FileKind File format to write text file as. If not specified, it /// will read from File. /// \param Compact Whether to limit whitespace in text file. - static Error writeToStream(raw_ostream &OS, const InterfaceFile &File, - const FileType FileKind = FileType::Invalid, - bool Compact = false); + LLVM_ABI static Error + writeToStream(raw_ostream &OS, const InterfaceFile &File, + const FileType FileKind = FileType::Invalid, + bool Compact = false); /// Get TAPI FileType from the input string. /// diff --git a/llvm/include/llvm/TextAPI/Utils.h b/llvm/include/llvm/TextAPI/Utils.h index 00dfd63e14f91..27db717f5a63b 100644 --- a/llvm/include/llvm/TextAPI/Utils.h +++ b/llvm/include/llvm/TextAPI/Utils.h @@ -14,6 +14,7 @@ #define LLVM_TEXTAPI_UTILS_H #include "llvm/ADT/Twine.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" @@ -51,34 +52,35 @@ struct SymLink { /// /// \param Path Location of file. /// \param Extension File extension to update with. -void replace_extension(SmallVectorImpl &Path, const Twine &Extension); +LLVM_ABI void replace_extension(SmallVectorImpl &Path, + const Twine &Extension); /// Determine whether to skip over symlink due to either too many symlink levels /// or is cyclic. /// /// \param Path Location to symlink. /// \param Result Holds whether to skip over Path. 
-std::error_code shouldSkipSymLink(const Twine &Path, bool &Result); +LLVM_ABI std::error_code shouldSkipSymLink(const Twine &Path, bool &Result); /// Turn absolute symlink into relative. /// /// \param From The symlink. /// \param To What the symlink points to. /// \param RelativePath Path location to update what the symlink points to. -std::error_code make_relative(StringRef From, StringRef To, - SmallVectorImpl &RelativePath); +LLVM_ABI std::error_code make_relative(StringRef From, StringRef To, + SmallVectorImpl &RelativePath); /// Determine if library is private by parsing file path. /// It does not touch the file system. /// /// \param Path File path for library. /// \param IsSymLink Whether path points to a symlink. -bool isPrivateLibrary(StringRef Path, bool IsSymLink = false); +LLVM_ABI bool isPrivateLibrary(StringRef Path, bool IsSymLink = false); /// Create a regex rule from provided glob string. /// \param Glob String that represents glob input. /// \return The equivalent regex rule. -llvm::Expected createRegexFromGlob(llvm::StringRef Glob); +LLVM_ABI llvm::Expected createRegexFromGlob(llvm::StringRef Glob); using AliasEntry = std::pair; using AliasMap = std::map; @@ -87,14 +89,15 @@ using AliasMap = std::map; /// /// \param Buffer Data contents of file for the alias list. /// \return Lookup table of alias to their base symbol. -Expected parseAliasList(std::unique_ptr &Buffer); +LLVM_ABI Expected +parseAliasList(std::unique_ptr &Buffer); /// Pickup active paths for a given platform. /// /// \param Paths File or search paths to pick up. /// \param Platform Platform to collect paths for. 
-PathSeq getPathsForPlatform(const PathToPlatformSeq &Paths, - PlatformType Platform); +LLVM_ABI PathSeq getPathsForPlatform(const PathToPlatformSeq &Paths, + PlatformType Platform); } // namespace llvm::MachO #endif // LLVM_TEXTAPI_UTILS_H From 78765bb856bd6cdc3b1db48e80f74b8de5181f3f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 11 Jun 2025 17:23:04 +0100 Subject: [PATCH 101/851] [TableGen] Simplify computeUberWeights. NFC. (#143716) Using RegUnitIterator made the code more complicated than having two nested loops over each register and each register's regunits. --- .../TableGen/Common/CodeGenRegisters.cpp | 29 ++++++++----------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index 5ec9b35379fa4..4d24eb3de1ed9 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -1849,26 +1849,21 @@ static void computeUberWeights(MutableArrayRef UberSets, // Skip the first unallocatable set. for (UberRegSet &S : UberSets.drop_front()) { // Initialize all unit weights in this set, and remember the max units/reg. 
- const CodeGenRegister *Reg = nullptr; - unsigned MaxWeight = 0, Weight = 0; - for (RegUnitIterator UnitI(S.Regs); UnitI.isValid(); ++UnitI) { - if (Reg != UnitI.getReg()) { - if (Weight > MaxWeight) - MaxWeight = Weight; - Reg = UnitI.getReg(); - Weight = 0; - } - if (!RegBank.getRegUnit(*UnitI).Artificial) { - unsigned UWeight = RegBank.getRegUnit(*UnitI).Weight; - if (!UWeight) { - UWeight = 1; - RegBank.increaseRegUnitWeight(*UnitI, UWeight); + unsigned MaxWeight = 0; + for (const CodeGenRegister *R : S.Regs) { + unsigned Weight = 0; + for (unsigned U : R->getRegUnits()) { + if (!RegBank.getRegUnit(U).Artificial) { + unsigned UWeight = RegBank.getRegUnit(U).Weight; + if (!UWeight) { + UWeight = 1; + RegBank.increaseRegUnitWeight(U, UWeight); + } + Weight += UWeight; } - Weight += UWeight; } + MaxWeight = std::max(MaxWeight, Weight); } - if (Weight > MaxWeight) - MaxWeight = Weight; if (S.Weight != MaxWeight) { LLVM_DEBUG({ dbgs() << "UberSet " << &S - UberSets.begin() << " Weight " From 8e4f0d8614dcd48cfe2d885a021e2927c1bc8616 Mon Sep 17 00:00:00 2001 From: Morris Hafner Date: Wed, 11 Jun 2025 18:24:46 +0200 Subject: [PATCH 102/851] [CIR] Upstream minimal builtin function call support (#142981) This patch adds all bits required to implement builtin function calls to ClangIR. It doesn't actually implement any of the builtins except those that fold to a constant ahead of CodeGen (`__builtin_is_constant_evaluated()` being one example). 
--- clang/include/clang/CIR/MissingFeatures.h | 3 +- clang/lib/CIR/CodeGen/CIRGenBuilder.cpp | 28 ++++++++ clang/lib/CIR/CodeGen/CIRGenBuilder.h | 11 ++++ clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp | 55 ++++++++++++++++ clang/lib/CIR/CodeGen/CIRGenCall.h | 30 ++++++++- clang/lib/CIR/CodeGen/CIRGenExpr.cpp | 53 +++++++++++++-- clang/lib/CIR/CodeGen/CIRGenFunction.h | 5 ++ clang/lib/CIR/CodeGen/CMakeLists.txt | 1 + clang/test/CIR/CodeGen/builtin_call.cpp | 78 +++++++++++++++++++++++ 9 files changed, 255 insertions(+), 9 deletions(-) create mode 100644 clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp create mode 100644 clang/test/CIR/CodeGen/builtin_call.cpp diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index f89d386378e51..87908e2ec08ac 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -83,7 +83,6 @@ struct MissingFeatures { static bool opFuncSetComdat() { return false; } // CallOp handling - static bool opCallBuiltinFunc() { return false; } static bool opCallPseudoDtor() { return false; } static bool opCallAggregateArgs() { return false; } static bool opCallPaddingArgs() { return false; } @@ -225,6 +224,8 @@ struct MissingFeatures { static bool isMemcpyEquivalentSpecialMember() { return false; } static bool isTrivialCtorOrDtor() { return false; } static bool implicitConstructorArgs() { return false; } + static bool intrinsics() { return false; } + static bool attributeNoBuiltin() { return false; } // Missing types static bool dataMemberType() { return false; } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp index 4c8c6ed289c3b..9cec17bcb2fd0 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp @@ -39,6 +39,34 @@ mlir::Value CIRGenBuilderTy::getArrayElement(mlir::Location arrayLocBegin, return create(arrayLocEnd, flatPtrTy, basePtr, idx); } +cir::ConstantOp 
CIRGenBuilderTy::getConstInt(mlir::Location loc, + llvm::APSInt intVal) { + bool isSigned = intVal.isSigned(); + unsigned width = intVal.getBitWidth(); + cir::IntType t = isSigned ? getSIntNTy(width) : getUIntNTy(width); + return getConstInt(loc, t, + isSigned ? intVal.getSExtValue() : intVal.getZExtValue()); +} + +cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc, + llvm::APInt intVal) { + return getConstInt(loc, llvm::APSInt(intVal)); +} + +cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc, mlir::Type t, + uint64_t c) { + assert(mlir::isa(t) && "expected cir::IntType"); + return create(loc, cir::IntAttr::get(t, c)); +} + +cir::ConstantOp +clang::CIRGen::CIRGenBuilderTy::getConstFP(mlir::Location loc, mlir::Type t, + llvm::APFloat fpVal) { + assert(mlir::isa(t) && + "expected floating point type"); + return create(loc, getAttr(t, fpVal)); +} + // This can't be defined in Address.h because that file is included by // CIRGenBuilder.h Address Address::withElementType(CIRGenBuilderTy &builder, diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index 03077ee062a65..fb1a290c18fa2 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -11,10 +11,12 @@ #include "Address.h" #include "CIRGenTypeCache.h" +#include "clang/CIR/Interfaces/CIRFPTypeInterface.h" #include "clang/CIR/MissingFeatures.h" #include "clang/CIR/Dialect/Builder/CIRBaseBuilder.h" #include "clang/CIR/MissingFeatures.h" +#include "llvm/ADT/APFloat.h" #include "llvm/ADT/STLExtras.h" namespace clang::CIRGen { @@ -229,6 +231,15 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { cir::IntType getUInt32Ty() { return typeCache.UInt32Ty; } cir::IntType getUInt64Ty() { return typeCache.UInt64Ty; } + cir::ConstantOp getConstInt(mlir::Location loc, llvm::APSInt intVal); + + cir::ConstantOp getConstInt(mlir::Location loc, llvm::APInt intVal); + + cir::ConstantOp getConstInt(mlir::Location loc, mlir::Type 
t, uint64_t c); + + cir::ConstantOp getConstFP(mlir::Location loc, mlir::Type t, + llvm::APFloat fpVal); + bool isInt8Ty(mlir::Type i) { return i == typeCache.UInt8Ty || i == typeCache.SInt8Ty; } diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp new file mode 100644 index 0000000000000..c59ac78210f81 --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -0,0 +1,55 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit Builtin calls as CIR or a function call to be +// later resolved. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenCall.h" +#include "CIRGenFunction.h" +#include "CIRGenModule.h" +#include "CIRGenValue.h" +#include "mlir/IR/BuiltinAttributes.h" +#include "mlir/IR/Value.h" +#include "mlir/Support/LLVM.h" +#include "clang/AST/Expr.h" +#include "clang/AST/GlobalDecl.h" +#include "llvm/Support/ErrorHandling.h" + +using namespace clang; +using namespace clang::CIRGen; + +RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, + const CallExpr *e, + ReturnValueSlot returnValue) { + // See if we can constant fold this builtin. If so, don't emit it at all. + // TODO: Extend this handling to all builtin calls that we can constant-fold. 
+ Expr::EvalResult result; + if (e->isPRValue() && e->EvaluateAsRValue(result, cgm.getASTContext()) && + !result.hasSideEffects()) { + if (result.Val.isInt()) { + return RValue::get(builder.getConstInt(getLoc(e->getSourceRange()), + result.Val.getInt())); + } + if (result.Val.isFloat()) { + // Note: we are using result type of CallExpr to determine the type of + // the constant. Classic codegen uses the result value to determine the + // type. We feel it should be Ok to use expression type because it is + // hard to imagine a builtin function evaluates to a value that + // over/underflows its own defined type. + mlir::Type type = convertType(e->getType()); + return RValue::get(builder.getConstFP(getLoc(e->getExprLoc()), type, + result.Val.getFloat())); + } + } + + mlir::Location loc = getLoc(e->getExprLoc()); + cgm.errorNYI(loc, "non constant foldable builtin calls"); + return getUndefRValue(e->getType()); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.h b/clang/lib/CIR/CodeGen/CIRGenCall.h index 605625705a75c..15c9080448c8b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCall.h +++ b/clang/lib/CIR/CodeGen/CIRGenCall.h @@ -44,16 +44,25 @@ class CIRGenCalleeInfo { class CIRGenCallee { enum class SpecialKind : uintptr_t { Invalid, + Builtin, - Last = Invalid, + Last = Builtin, + }; + + struct BuiltinInfoStorage { + const clang::FunctionDecl *decl; + unsigned id; }; SpecialKind kindOrFunctionPtr; union { CIRGenCalleeInfo abstractInfo; + BuiltinInfoStorage builtinInfo; }; + explicit CIRGenCallee(SpecialKind kind) : kindOrFunctionPtr(kind) {} + public: CIRGenCallee() : kindOrFunctionPtr(SpecialKind::Invalid) {} @@ -69,6 +78,25 @@ class CIRGenCallee { return CIRGenCallee(abstractInfo, funcPtr); } + bool isBuiltin() const { return kindOrFunctionPtr == SpecialKind::Builtin; } + + const clang::FunctionDecl *getBuiltinDecl() const { + assert(isBuiltin()); + return builtinInfo.decl; + } + unsigned getBuiltinID() const { + assert(isBuiltin()); + return builtinInfo.id; + } + + 
static CIRGenCallee forBuiltin(unsigned builtinID, + const clang::FunctionDecl *builtinDecl) { + CIRGenCallee result(SpecialKind::Builtin); + result.builtinInfo.decl = builtinDecl; + result.builtinInfo.id = builtinID; + return result; + } + bool isOrdinary() const { return uintptr_t(kindOrFunctionPtr) > uintptr_t(SpecialKind::Last); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index f2c2de7a4f59d..f1f86509c9a9b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -1029,8 +1029,48 @@ static cir::FuncOp emitFunctionDeclPointer(CIRGenModule &cgm, GlobalDecl gd) { return cgm.getAddrOfFunction(gd); } -static CIRGenCallee emitDirectCallee(CIRGenModule &cgm, GlobalDecl gd) { - assert(!cir::MissingFeatures::opCallBuiltinFunc()); +// Detect the unusual situation where an inline version is shadowed by a +// non-inline version. In that case we should pick the external one +// everywhere. That's GCC behavior too. +static bool onlyHasInlineBuiltinDeclaration(const FunctionDecl *fd) { + for (const FunctionDecl *pd = fd; pd; pd = pd->getPreviousDecl()) + if (!pd->isInlineBuiltinDeclaration()) + return false; + return true; +} + +CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { + const auto *fd = cast(gd.getDecl()); + + if (unsigned builtinID = fd->getBuiltinID()) { + if (fd->getAttr()) { + cgm.errorNYI("AsmLabelAttr"); + } + + StringRef ident = fd->getName(); + std::string fdInlineName = (ident + ".inline").str(); + + bool isPredefinedLibFunction = + cgm.getASTContext().BuiltinInfo.isPredefinedLibFunction(builtinID); + bool hasAttributeNoBuiltin = false; + assert(!cir::MissingFeatures::attributeNoBuiltin()); + + // When directing calling an inline builtin, call it through it's mangled + // name to make it clear it's not the actual builtin. 
+ auto fn = cast(curFn); + if (fn.getName() != fdInlineName && onlyHasInlineBuiltinDeclaration(fd)) { + cgm.errorNYI("Inline only builtin function calls"); + } + + // Replaceable builtins provide their own implementation of a builtin. If we + // are in an inline builtin implementation, avoid trivial infinite + // recursion. Honor __attribute__((no_builtin("foo"))) or + // __attribute__((no_builtin)) on the current function unless foo is + // not a predefined library function which means we must generate the + // builtin no matter what. + else if (!isPredefinedLibFunction || !hasAttributeNoBuiltin) + return CIRGenCallee::forBuiltin(builtinID, fd); + } cir::FuncOp callee = emitFunctionDeclPointer(cgm, gd); @@ -1106,7 +1146,7 @@ CIRGenCallee CIRGenFunction::emitCallee(const clang::Expr *e) { } else if (const auto *declRef = dyn_cast(e)) { // Resolve direct calls. const auto *funcDecl = cast(declRef->getDecl()); - return emitDirectCallee(cgm, funcDecl); + return emitDirectCallee(funcDecl); } else if (isa(e)) { cgm.errorNYI(e->getSourceRange(), "emitCallee: call to member function is NYI"); @@ -1162,10 +1202,9 @@ RValue CIRGenFunction::emitCallExpr(const clang::CallExpr *e, CIRGenCallee callee = emitCallee(e->getCallee()); - if (e->getBuiltinCallee()) { - cgm.errorNYI(e->getSourceRange(), "call to builtin functions"); - } - assert(!cir::MissingFeatures::opCallBuiltinFunc()); + if (callee.isBuiltin()) + return emitBuiltinExpr(callee.getBuiltinDecl(), callee.getBuiltinID(), e, + returnValue); if (isa(e->getCallee())) { cgm.errorNYI(e->getSourceRange(), "call to pseudo destructor"); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 7db7f6928fd8f..b08dd540e6289 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -665,6 +665,8 @@ class CIRGenFunction : public CIRGenTypeCache { void emitAndUpdateRetAlloca(clang::QualType type, mlir::Location loc, clang::CharUnits alignment); + 
CIRGenCallee emitDirectCallee(const GlobalDecl &gd); + public: Address emitAddrOfFieldStorage(Address base, const FieldDecl *field, llvm::StringRef fieldName, @@ -711,6 +713,9 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitBreakStmt(const clang::BreakStmt &s); + RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID, + const clang::CallExpr *e, ReturnValueSlot returnValue); + RValue emitCall(const CIRGenFunctionInfo &funcInfo, const CIRGenCallee &callee, ReturnValueSlot returnValue, const CallArgList &args, cir::CIRCallOpInterface *callOp, diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index 8bfcd2773d07a..beaa9afb31f93 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -13,6 +13,7 @@ add_clang_library(clangCIR CIRGenClass.cpp CIRGenCXXABI.cpp CIRGenCXXExpr.cpp + CIRGenBuiltin.cpp CIRGenDecl.cpp CIRGenDeclOpenACC.cpp CIRGenExpr.cpp diff --git a/clang/test/CIR/CodeGen/builtin_call.cpp b/clang/test/CIR/CodeGen/builtin_call.cpp new file mode 100644 index 0000000000000..2706ea7f8f857 --- /dev/null +++ b/clang/test/CIR/CodeGen/builtin_call.cpp @@ -0,0 +1,78 @@ +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s -check-prefix=CIR +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s -check-prefix=LLVM +// RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-linux-gnu -Wno-unused-value -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s -check-prefix=OGCG + +constexpr extern int cx_var = __builtin_is_constant_evaluated(); + +// CIR: cir.global {{.*}} @cx_var = #cir.int<1> : !s32i +// LLVM: @cx_var = {{.*}} i32 1 +// OGCG: @cx_var = {{.*}} i32 1 + +constexpr extern float cx_var_single = __builtin_huge_valf(); + +// CIR: 
cir.global {{.*}} @cx_var_single = #cir.fp<0x7F800000> : !cir.float +// LLVM: @cx_var_single = {{.*}} float 0x7FF0000000000000 +// OGCG: @cx_var_single = {{.*}} float 0x7FF0000000000000 + +constexpr extern long double cx_var_ld = __builtin_huge_vall(); + +// CIR: cir.global {{.*}} @cx_var_ld = #cir.fp<0x7FFF8000000000000000> : !cir.long_double +// LLVM: @cx_var_ld = {{.*}} x86_fp80 0xK7FFF8000000000000000 +// OGCG: @cx_var_ld = {{.*}} x86_fp80 0xK7FFF8000000000000000 + +int is_constant_evaluated() { + return __builtin_is_constant_evaluated(); +} + +// CIR: cir.func @_Z21is_constant_evaluatedv() -> !s32i +// CIR: %[[ZERO:.+]] = cir.const #cir.int<0> + +// LLVM: define {{.*}}i32 @_Z21is_constant_evaluatedv() +// LLVM: %[[MEM:.+]] = alloca i32 +// LLVM: store i32 0, ptr %[[MEM]] +// LLVM: %[[RETVAL:.+]] = load i32, ptr %[[MEM]] +// LLVM: ret i32 %[[RETVAL]] +// LLVM: } + +// OGCG: define {{.*}}i32 @_Z21is_constant_evaluatedv() +// OGCG: ret i32 0 +// OGCG: } + +long double constant_fp_builtin_ld() { + return __builtin_fabsl(-0.1L); +} + +// CIR: cir.func @_Z22constant_fp_builtin_ldv() -> !cir.long_double +// CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.long_double + +// LLVM: define {{.*}}x86_fp80 @_Z22constant_fp_builtin_ldv() +// LLVM: %[[MEM:.+]] = alloca x86_fp80 +// LLVM: store x86_fp80 0xK3FFBCCCCCCCCCCCCCCCD, ptr %[[MEM]] +// LLVM: %[[RETVAL:.+]] = load x86_fp80, ptr %[[MEM]] +// LLVM: ret x86_fp80 %[[RETVAL]] +// LLVM: } + +// OGCG: define {{.*}}x86_fp80 @_Z22constant_fp_builtin_ldv() +// OGCG: ret x86_fp80 0xK3FFBCCCCCCCCCCCCCCCD +// OGCG: } + +float constant_fp_builtin_single() { + return __builtin_fabsf(-0.1f); +} + +// CIR: cir.func @_Z26constant_fp_builtin_singlev() -> !cir.float +// CIR: %[[PONE:.+]] = cir.const #cir.fp<1.000000e-01> : !cir.float + +// LLVM: define {{.*}}float @_Z26constant_fp_builtin_singlev() +// LLVM: %[[MEM:.+]] = alloca float +// LLVM: store float 0x3FB99999A0000000, ptr %[[MEM]] +// LLVM: %[[RETVAL:.+]] = load float, 
ptr %[[MEM]] +// LLVM: ret float %[[RETVAL]] +// LLVM: } + +// OGCG: define {{.*}}float @_Z26constant_fp_builtin_singlev() +// OGCG: ret float 0x3FB99999A0000000 +// OGCG: } From ec8d68b59f82423e5a6bf452e33ee8c5f64b0edc Mon Sep 17 00:00:00 2001 From: vabridgers <58314289+vabridgers@users.noreply.github.com> Date: Wed, 11 Jun 2025 11:25:24 -0500 Subject: [PATCH 103/851] [clang][analyzer] Correct SMT Layer for _BitInt cases refutations (#143310) Since _BitInt was added later, ASTContext did not comprehend getting a type by bitwidth that's not a power of 2, and the SMT layer also did not comprehend this. This led to unexpected crashes using Z3 refutation during randomized testing. The assertion and redacted and summarized crash stack is shown here. clang: ../../clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h:103: static llvm::SMTExprRef clang::ento::SMTConv::fromBinOp(llvm::SMTSolverRef &, const llvm::SMTExprRef &, const BinaryOperator::Opcode, const llvm::SMTExprRef &, bool): Assertion `*Solver->getSort(LHS) == *Solver->getSort(RHS) && "AST's must have the same sort!"' failed. ...
clang::ento::SMTConv::fromBinOp(std::shared_ptr&, llvm::SMTExpr const* const&, clang::BinaryOperatorKind, llvm::SMTExpr const* const&, bool) SMTConstraintManager.cpp clang::ASTContext&, llvm::SMTExpr const* const&, clang::QualType, clang::BinaryOperatorKind, llvm::SMTExpr const* const&, clang::QualType, clang::QualType*) SMTConstraintManager.cpp clang::ASTContext&, clang::ento::SymExpr const*, llvm::APSInt const&, llvm::APSInt const&, bool) SMTConstraintManager.cpp clang::ento::ExplodedNode const*, clang::ento::PathSensitiveBugReport&) --------- Co-authored-by: Vince Bridgers --- .../Core/PathSensitive/SMTConv.h | 28 ++++++++++++++----- clang/test/Analysis/bitint-z3.c | 22 +++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) create mode 100644 clang/test/Analysis/bitint-z3.c diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h index 580b49a38dc72..70a7953918ace 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SMTConv.h @@ -18,6 +18,8 @@ #include "clang/StaticAnalyzer/Core/PathSensitive/SymbolManager.h" #include "llvm/Support/SMTAPI.h" +#include + namespace clang { namespace ento { @@ -570,23 +572,35 @@ class SMTConv { // TODO: Refactor to put elsewhere static inline QualType getAPSIntType(ASTContext &Ctx, const llvm::APSInt &Int) { - return Ctx.getIntTypeForBitwidth(Int.getBitWidth(), Int.isSigned()); + const QualType Ty = + Ctx.getIntTypeForBitwidth(Int.getBitWidth(), Int.isSigned()); + if (!Ty.isNull()) + return Ty; + // If Ty is Null, could be because the original type was a _BitInt. + // Get the size of the _BitInt type (expressed in bits) and round it up to + // the next power of 2 that is at least the bit size of 'char' (usually 8). 
+ unsigned CharTypeSize = Ctx.getTypeSize(Ctx.CharTy); + unsigned Pow2DestWidth = + std::max(llvm::bit_ceil(Int.getBitWidth()), CharTypeSize); + return Ctx.getIntTypeForBitwidth(Pow2DestWidth, Int.isSigned()); } // Get the QualTy for the input APSInt, and fix it if it has a bitwidth of 1. static inline std::pair fixAPSInt(ASTContext &Ctx, const llvm::APSInt &Int) { llvm::APSInt NewInt; + unsigned APSIntBitwidth = Int.getBitWidth(); + QualType Ty = getAPSIntType(Ctx, Int); // FIXME: This should be a cast from a 1-bit integer type to a boolean type, // but the former is not available in Clang. Instead, extend the APSInt // directly. - if (Int.getBitWidth() == 1 && getAPSIntType(Ctx, Int).isNull()) { - NewInt = Int.extend(Ctx.getTypeSize(Ctx.BoolTy)); - } else - NewInt = Int; - - return std::make_pair(NewInt, getAPSIntType(Ctx, NewInt)); + if (APSIntBitwidth == 1 && Ty.isNull()) + return {Int.extend(Ctx.getTypeSize(Ctx.BoolTy)), + getAPSIntType(Ctx, NewInt)}; + if (llvm::isPowerOf2_32(APSIntBitwidth) || Ty.isNull()) + return {Int, Ty}; + return {Int.extend(Ctx.getTypeSize(Ty)), Ty}; } // Perform implicit type conversion on binary symbolic expressions. diff --git a/clang/test/Analysis/bitint-z3.c b/clang/test/Analysis/bitint-z3.c new file mode 100644 index 0000000000000..4cb97f9de8299 --- /dev/null +++ b/clang/test/Analysis/bitint-z3.c @@ -0,0 +1,22 @@ +// RUN: %clang_analyze_cc1 -analyzer-checker=core,debug.ExprInspection -w \ +// RUN: -analyzer-config crosscheck-with-z3=true -verify %s +// REQUIRES: z3 + +// Previously these tests were crashing because the SMTConv layer did not +// comprehend the _BitInt types. 
+ +void clang_analyzer_warnIfReached(); + +void c(int b, _BitInt(35) a) { + int d = 0; + if (a) + b = d; + clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}} +} + +void f(int *d, _BitInt(3) e) { + int g; + d = &g; + e ?: 0; + clang_analyzer_warnIfReached(); // expected-warning{{REACHABLE}} +} From fe7bf4b90b1a835418bddd2b2aa63b4977a9f6d2 Mon Sep 17 00:00:00 2001 From: Rolf Morel <854835+rolfmorel@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:33:55 +0100 Subject: [PATCH 104/851] [MLIR][Transform] apply_registered_pass op's options as a dict (#143159) Improve ApplyRegisteredPassOp's support for taking options by taking them as a dict (vs a list of string-valued key-value pairs). Values of options are provided as either static attributes or as params (which pass in attributes at interpreter runtime). In either case, the keys and value attributes are converted to strings and a single options-string, in the format used on the commandline, is constructed to pass to the `addToPipeline`-pass API. 
--- .../mlir/Dialect/Transform/IR/CMakeLists.txt | 4 + .../Dialect/Transform/IR/TransformAttrs.h | 3 + .../Dialect/Transform/IR/TransformAttrs.td | 19 ++ .../Dialect/Transform/IR/TransformDialect.td | 1 + .../mlir/Dialect/Transform/IR/TransformOps.td | 23 +- .../Dialect/Transform/IR/TransformDialect.cpp | 9 + .../lib/Dialect/Transform/IR/TransformOps.cpp | 223 +++++++++++------- .../mlir/dialects/transform/__init__.py | 82 ++++++- .../Transform/test-pass-application.mlir | 169 +++++++++++-- mlir/test/python/dialects/transform.py | 52 ++++ 10 files changed, 469 insertions(+), 116 deletions(-) diff --git a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt index df5af7ae710da..9acab9228f100 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt +++ b/mlir/include/mlir/Dialect/Transform/IR/CMakeLists.txt @@ -20,6 +20,10 @@ mlir_tablegen(TransformDialectEnums.h.inc -gen-enum-decls) mlir_tablegen(TransformDialectEnums.cpp.inc -gen-enum-defs) add_public_tablegen_target(MLIRTransformDialectEnumIncGen) add_dependencies(mlir-headers MLIRTransformDialectEnumIncGen) +mlir_tablegen(TransformAttrs.h.inc -gen-attrdef-decls) +mlir_tablegen(TransformAttrs.cpp.inc -gen-attrdef-defs) +add_public_tablegen_target(MLIRTransformDialectAttributesIncGen) +add_dependencies(mlir-headers MLIRTransformDialectAttributesIncGen) add_mlir_dialect(TransformOps transform) add_mlir_doc(TransformOps TransformOps Dialects/ -gen-op-doc -dialect=transform) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h index 3cb935003b4c4..379af932ca484 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.h @@ -17,4 +17,7 @@ #include "mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc" +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/Transform/IR/TransformAttrs.h.inc" + #endif // 
MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS_H diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td index ebad2994880e7..e67a9444c24a8 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformAttrs.td @@ -10,6 +10,14 @@ #define MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS include "mlir/IR/EnumAttr.td" +include "mlir/Dialect/Transform/IR/TransformDialect.td" + +class Transform_Attr traits = [], + string baseCppClass = "::mlir::Attribute"> + : AttrDef { + let mnemonic = attrMnemonic; +} def PropagateFailuresCase : I32EnumAttrCase<"Propagate", 1, "propagate">; def SuppressFailuresCase : I32EnumAttrCase<"Suppress", 2, "suppress">; @@ -33,4 +41,15 @@ def MatchCmpIPredicateAttr : I32EnumAttr< let cppNamespace = "::mlir::transform"; } +def ParamOperandAttr : Transform_Attr<"ParamOperand", "param_operand"> { + let description = [{ + Used to refer to a specific param-operand (via its index) from within an + attribute on a transform operation. + }]; + let parameters = (ins + "IntegerAttr":$index + ); + let assemblyFormat = "`<` `index` `=` $index `>`"; +} + #endif // MLIR_DIALECT_TRANSFORM_IR_TRANSFORMATTRS diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td index d03049e186f94..c7ea5ade72ace 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformDialect.td @@ -19,6 +19,7 @@ def Transform_Dialect : Dialect { let cppNamespace = "::mlir::transform"; let hasOperationAttrVerify = 1; + let useDefaultAttributePrinterParser = 1; let extraClassDeclaration = [{ /// Symbol name for the default entry point "named sequence". 
constexpr const static ::llvm::StringLiteral diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index e864a65f8ceac..f75ba27e58e76 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -405,10 +405,23 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass", let description = [{ This transform applies the specified pass or pass pipeline to the targeted ops. The name of the pass/pipeline is specified as a string attribute, as - set during pass/pipeline registration. Optionally, pass options may be - specified as (space-separated) string attributes with the option to pass - these attributes via params. The pass options syntax is identical to the one - used with "mlir-opt". + set during pass/pipeline registration. + + Optionally, pass options may be specified via a DictionaryAttr. This + dictionary is converted to a string -- formatted `key=value ...` -- which + is expected to be in the exact format used by the pass on the commandline. + Values are either attributes or (SSA-values of) Transform Dialect params. + For example: + + ```mlir + transform.apply_registered_pass "canonicalize" + with options = { "top-down" = false, + "max-iterations" = %max_iter, + "test-convergence" = true, + "max-num-rewrites" = %max_rewrites } + to %module + : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + ``` This op first looks for a pass pipeline with the specified name. If no such pipeline exists, it looks for a pass with the specified name. 
If no such @@ -422,7 +435,7 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass", }]; let arguments = (ins StrAttr:$pass_name, - DefaultValuedAttr:$options, + DefaultValuedAttr:$options, Variadic:$dynamic_options, TransformHandleTypeInterface:$target); let results = (outs TransformHandleTypeInterface:$result); diff --git a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp index 497ceb19f1a21..4a95fe7459e8c 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformDialect.cpp @@ -8,17 +8,22 @@ #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Analysis/CallGraph.h" +#include "mlir/Dialect/Transform/IR/TransformAttrs.h" #include "mlir/Dialect/Transform/IR/TransformOps.h" #include "mlir/Dialect/Transform/IR/TransformTypes.h" #include "mlir/Dialect/Transform/IR/Utils.h" #include "mlir/Dialect/Transform/Interfaces/TransformInterfaces.h" #include "mlir/IR/DialectImplementation.h" #include "llvm/ADT/SCCIterator.h" +#include "llvm/ADT/TypeSwitch.h" using namespace mlir; #include "mlir/Dialect/Transform/IR/TransformDialect.cpp.inc" +#define GET_ATTRDEF_CLASSES +#include "mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc" + #ifndef NDEBUG void transform::detail::checkImplementsTransformOpInterface( StringRef name, MLIRContext *context) { @@ -66,6 +71,10 @@ void transform::TransformDialect::initialize() { #include "mlir/Dialect/Transform/IR/TransformOps.cpp.inc" >(); initializeTypes(); + addAttributes< +#define GET_ATTRDEF_LIST +#include "mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc" + >(); initializeLibraryModule(); } diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index a0f9518e3d12f..582d082153bef 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -54,10 +54,11 @@ using namespace mlir; static 
ParseResult parseApplyRegisteredPassOptions( - OpAsmParser &parser, ArrayAttr &options, + OpAsmParser &parser, DictionaryAttr &options, SmallVectorImpl &dynamicOptions); static void printApplyRegisteredPassOptions(OpAsmPrinter &printer, - Operation *op, ArrayAttr options, + Operation *op, + DictionaryAttr options, ValueRange dynamicOptions); static ParseResult parseSequenceOpOperands( OpAsmParser &parser, std::optional &root, @@ -784,41 +785,50 @@ DiagnosedSilenceableFailure transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter, transform::TransformResults &results, transform::TransformState &state) { - // Obtain a single options-string from options passed statically as - // string attributes as well as "dynamically" through params. + // Obtain a single options-string to pass to the pass(-pipeline) from options + // passed in as a dictionary of keys mapping to values which are either + // attributes or param-operands pointing to attributes. + std::string options; + llvm::raw_string_ostream optionsStream(options); // For "printing" attrs. + OperandRange dynamicOptions = getDynamicOptions(); - size_t dynamicOptionsIdx = 0; - for (auto [idx, optionAttr] : llvm::enumerate(getOptions())) { + for (auto [idx, namedAttribute] : llvm::enumerate(getOptions())) { if (idx > 0) - options += " "; // Interleave options seperator. - - if (auto strAttr = dyn_cast(optionAttr)) { - options += strAttr.getValue(); - } else if (isa(optionAttr)) { - assert(dynamicOptionsIdx < dynamicOptions.size() && + optionsStream << " "; // Interleave options separator. + optionsStream << namedAttribute.getName().str(); // Append the key. + optionsStream << "="; // And the key-value separator. + + Attribute valueAttrToAppend; + if (auto paramOperandIndex = + dyn_cast(namedAttribute.getValue())) { + // The corresponding value attribute is passed in via a param. + // Obtain the param-operand via its specified index. 
+ size_t dynamicOptionIdx = paramOperandIndex.getIndex().getInt(); + assert(dynamicOptionIdx < dynamicOptions.size() && "number of dynamic option markers (UnitAttr) in options ArrayAttr " "should be the same as the number of options passed as params"); ArrayRef dynamicOption = - state.getParams(dynamicOptions[dynamicOptionsIdx++]); + state.getParams(dynamicOptions[dynamicOptionIdx]); if (dynamicOption.size() != 1) - return emitSilenceableError() << "options passed as a param must have " - "a single value associated, param " - << dynamicOptionsIdx - 1 << " associates " - << dynamicOption.size(); - - if (auto dynamicOptionStr = dyn_cast(dynamicOption[0])) { - options += dynamicOptionStr.getValue(); - } else { return emitSilenceableError() - << "options passed as a param must be a string, got " - << dynamicOption[0]; - } + << "options passed as a param must have " + "a single value associated, param " + << dynamicOptionIdx << " associates " << dynamicOption.size(); + valueAttrToAppend = dynamicOption[0]; + } else { + // Value is a static attribute. + valueAttrToAppend = namedAttribute.getValue(); + } + + // Append string representation of value attribute. + if (auto strAttr = dyn_cast(valueAttrToAppend)) { + optionsStream << strAttr.getValue().str(); } else { - llvm_unreachable( - "expected options element to be either StringAttr or UnitAttr"); + valueAttrToAppend.print(optionsStream, /*elideType=*/true); } } + optionsStream.flush(); // Get pass or pass pipeline from registry. 
const PassRegistryEntry *info = PassPipelineInfo::lookup(getPassName()); @@ -864,84 +874,121 @@ transform::ApplyRegisteredPassOp::apply(transform::TransformRewriter &rewriter, } static ParseResult parseApplyRegisteredPassOptions( - OpAsmParser &parser, ArrayAttr &options, + OpAsmParser &parser, DictionaryAttr &options, SmallVectorImpl &dynamicOptions) { - auto dynamicOptionMarker = UnitAttr::get(parser.getContext()); - SmallVector optionsArray; - - auto parseOperandOrString = [&]() -> OptionalParseResult { - OpAsmParser::UnresolvedOperand operand; - OptionalParseResult parsedOperand = parser.parseOptionalOperand(operand); - if (parsedOperand.has_value()) { - if (failed(parsedOperand.value())) - return failure(); - - dynamicOptions.push_back(operand); - optionsArray.push_back( - dynamicOptionMarker); // Placeholder for knowing where to - // inject the dynamic option-as-param. - return success(); - } + // Construct the options DictionaryAttr per a `{ key = value, ... }` syntax. + SmallVector keyValuePairs; - StringAttr stringAttr; - OptionalParseResult parsedStringAttr = - parser.parseOptionalAttribute(stringAttr); - if (parsedStringAttr.has_value()) { - if (failed(parsedStringAttr.value())) - return failure(); - optionsArray.push_back(stringAttr); - return success(); - } + size_t dynamicOptionsIdx = 0; + auto parseKeyValuePair = [&]() -> ParseResult { + // Parse items of the form `key = value` where `key` is a bare identifier or + // a string and `value` is either an attribute or an operand. + + std::string key; + Attribute valueAttr; + if (parser.parseOptionalKeywordOrString(&key)) + return parser.emitError(parser.getCurrentLocation()) + << "expected key to either be an identifier or a string"; + if (key.empty()) + return failure(); - return std::nullopt; + if (parser.parseEqual()) + return parser.emitError(parser.getCurrentLocation()) + << "expected '=' after key in key-value pair"; + + // Parse the value, which can be either an attribute or an operand. 
+ OptionalParseResult parsedValueAttr = + parser.parseOptionalAttribute(valueAttr); + if (!parsedValueAttr.has_value()) { + OpAsmParser::UnresolvedOperand operand; + ParseResult parsedOperand = parser.parseOperand(operand); + if (failed(parsedOperand)) + return parser.emitError(parser.getCurrentLocation()) + << "expected a valid attribute or operand as value associated " + << "to key '" << key << "'"; + // To make use of the operand, we need to store it in the options dict. + // As SSA-values cannot occur in attributes, what we do instead is store + // an attribute in its place that contains the index of the param-operand, + // so that an attr-value associated to the param can be resolved later on. + dynamicOptions.push_back(operand); + auto wrappedIndex = IntegerAttr::get( + IntegerType::get(parser.getContext(), 64), dynamicOptionsIdx++); + valueAttr = + transform::ParamOperandAttr::get(parser.getContext(), wrappedIndex); + } else if (failed(parsedValueAttr.value())) { + return failure(); // NB: Attempted parse should have output error message. + } else if (isa(valueAttr)) { + return parser.emitError(parser.getCurrentLocation()) + << "the param_operand attribute is a marker reserved for " + << "indicating a value will be passed via params and is only used " + << "in the generic print format"; + } + + keyValuePairs.push_back(NamedAttribute(key, valueAttr)); + return success(); }; - OptionalParseResult parsedOptionsElement = parseOperandOrString(); - while (parsedOptionsElement.has_value()) { - if (failed(parsedOptionsElement.value())) - return failure(); - parsedOptionsElement = parseOperandOrString(); - } + if (parser.parseCommaSeparatedList(AsmParser::Delimiter::Braces, + parseKeyValuePair, + " in options dictionary")) + return failure(); // NB: Attempted parse should have output error message. - if (optionsArray.empty()) { + if (DictionaryAttr::findDuplicate( + keyValuePairs, /*isSorted=*/false) // Also sorts the keyValuePairs. 
+ .has_value()) return parser.emitError(parser.getCurrentLocation()) - << "expected at least one option (either a string or a param)"; - } - options = parser.getBuilder().getArrayAttr(optionsArray); + << "duplicate keys found in options dictionary"; + + options = DictionaryAttr::getWithSorted(parser.getContext(), keyValuePairs); + return success(); } static void printApplyRegisteredPassOptions(OpAsmPrinter &printer, - Operation *op, ArrayAttr options, + Operation *op, + DictionaryAttr options, ValueRange dynamicOptions) { - size_t currentDynamicOptionIdx = 0; - for (auto [idx, optionAttr] : llvm::enumerate(options)) { - if (idx > 0) - printer << " "; // Interleave options separator. + if (options.empty()) + return; - if (isa(optionAttr)) - printer.printOperand(dynamicOptions[currentDynamicOptionIdx++]); - else if (auto strAttr = dyn_cast(optionAttr)) - printer.printAttribute(strAttr); - else - llvm_unreachable("each option should be either a StringAttr or UnitAttr"); - } + printer << "{"; + llvm::interleaveComma(options, printer, [&](NamedAttribute namedAttribute) { + printer << namedAttribute.getName() << " = "; + Attribute value = namedAttribute.getValue(); + if (auto indexAttr = dyn_cast(value)) { + // Resolve index of param-operand to its actual SSA-value and print that. 
+ printer.printOperand(dynamicOptions[indexAttr.getIndex().getInt()]); + } else { + printer.printAttribute(value); + } + }); + printer << "}"; } LogicalResult transform::ApplyRegisteredPassOp::verify() { - size_t numUnitsInOptions = 0; - for (Attribute optionsElement : getOptions()) { - if (isa(optionsElement)) - numUnitsInOptions++; - else if (!isa(optionsElement)) - return emitOpError() << "expected each option to be either a StringAttr " - << "or a UnitAttr, got " << optionsElement; - } - - if (getDynamicOptions().size() != numUnitsInOptions) - return emitOpError() - << "expected the same number of options passed as params as " - << "UnitAttr elements in options ArrayAttr"; + // Check that there is a one-to-one correspondence between param operands + // and references to dynamic options in the options dictionary. + + auto dynamicOptions = SmallVector(getDynamicOptions()); + for (NamedAttribute namedAttr : getOptions()) + if (auto paramOperand = + dyn_cast(namedAttr.getValue())) { + size_t dynamicOptionIdx = paramOperand.getIndex().getInt(); + if (dynamicOptionIdx < 0 || dynamicOptionIdx >= dynamicOptions.size()) + return emitOpError() + << "dynamic option index " << dynamicOptionIdx + << " is out of bounds for the number of dynamic options: " + << dynamicOptions.size(); + if (dynamicOptions[dynamicOptionIdx] == nullptr) + return emitOpError() << "dynamic option index " << dynamicOptionIdx + << " is already used in options"; + dynamicOptions[dynamicOptionIdx] = nullptr; // Mark this option as used. 
+ } + + for (Value dynamicOption : dynamicOptions) + if (dynamicOption) + return emitOpError() << "a param operand does not have a corresponding " + << "param_operand attr in the options dict"; return success(); } diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py index 5b158ec6b65fd..10a04b0cc14e0 100644 --- a/mlir/python/mlir/dialects/transform/__init__.py +++ b/mlir/python/mlir/dialects/transform/__init__.py @@ -18,7 +18,12 @@ except ImportError as e: raise RuntimeError("Error loading imports from extension module") from e -from typing import Optional, Sequence, Union, NewType +from typing import Dict, Optional, Sequence, Union, NewType + + +@register_attribute_builder("ParamOperandAttr") +def _paramOperandAttr(x: int, context) -> Attribute: + return Attribute.parse(f"#transform.param_operand", context=context) @_ods_cext.register_operation(_Dialect, replace=True) @@ -214,6 +219,81 @@ def __init__( super().__init__(_get_op_results_or_values(operands), loc=loc, ip=ip) +@_ods_cext.register_operation(_Dialect, replace=True) +class ApplyRegisteredPassOp(ApplyRegisteredPassOp): + def __init__( + self, + result: Type, + pass_name: Union[str, StringAttr], + target: Union[Operation, Value, OpView], + *, + options: Optional[ + Dict[ + Union[str, StringAttr], + Union[Attribute, Value, Operation, OpView], + ] + ] = None, + loc=None, + ip=None, + ): + options_dict = {} + dynamic_options = [] + + ParamOperandAttr = AttrBuilder.get("ParamOperandAttr") + context = (loc and loc.context) or Context.current + + cur_param_operand_idx = 0 + for key, value in options.items() if options is not None else {}: + if isinstance(key, StringAttr): + key = key.value + + if isinstance(value, (Value, Operation, OpView)): + dynamic_options.append(_get_op_result_or_value(value)) + options_dict[key] = ParamOperandAttr(cur_param_operand_idx, context) + cur_param_operand_idx += 1 + elif isinstance(value, Attribute): + options_dict[key] = 
value + elif isinstance(value, str): + options_dict[key] = StringAttr.get(value) + else: + raise TypeError(f"Unsupported option type: {type(value)}") + if len(options_dict) > 0: + print(options_dict, cur_param_operand_idx) + super().__init__( + result, + pass_name, + dynamic_options, + target=_get_op_result_or_value(target), + options=DictAttr.get(options_dict), + loc=loc, + ip=ip, + ) + + +def apply_registered_pass( + result: Type, + pass_name: Union[str, StringAttr], + target: Union[Operation, Value, OpView], + *, + options: Optional[ + Dict[ + Union[str, StringAttr], + Union[Attribute, Value, Operation, OpView], + ] + ] = None, + loc=None, + ip=None, +) -> Value: + return ApplyRegisteredPassOp( + result=result, + pass_name=pass_name, + target=target, + options=options, + loc=loc, + ip=ip, + ).result + + AnyOpTypeT = NewType("AnyOpType", AnyOpType) diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir index 463fd98afa65c..6e6d4eb7e249f 100644 --- a/mlir/test/Dialect/Transform/test-pass-application.mlir +++ b/mlir/test/Dialect/Transform/test-pass-application.mlir @@ -80,7 +80,7 @@ module attributes {transform.with_named_sequence} { // expected-error @below {{failed to add pass or pass pipeline to pipeline: canonicalize}} // expected-error @below {{: no such option invalid-option}} transform.apply_registered_pass "canonicalize" - with options = "invalid-option=1" to %1 + with options = { "invalid-option" = 1 } to %1 : (!transform.any_op) -> !transform.any_op transform.yield } @@ -97,7 +97,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op transform.apply_registered_pass "canonicalize" - with options = "top-down=false" to %1 + with options = { "top-down" = false } to %1 : (!transform.any_op) -> !transform.any_op 
transform.yield } @@ -115,7 +115,7 @@ module attributes {transform.with_named_sequence} { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op //transform.apply_registered_pass "canonicalize" with options = "top-down=false,max-iterations=10" to %1 : (!transform.any_op) -> !transform.any_op transform.apply_registered_pass "canonicalize" - with options = "top-down=false test-convergence=true" to %1 + with options = { "top-down" = false, "test-convergence" =true } to %1 : (!transform.any_op) -> !transform.any_op transform.yield } @@ -132,7 +132,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op transform.apply_registered_pass "canonicalize" - with options = "top-down=false" "max-iterations=0" to %1 + with options = { "top-down" = false, "max-iterations" = 0 } to %1 : (!transform.any_op) -> !transform.any_op transform.yield } @@ -148,10 +148,15 @@ func.func @valid_dynamic_pass_options() { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param - %max_rewrites = transform.param.constant "max-num-rewrites=1" -> !transform.any_param - %2 = transform.apply_registered_pass "canonicalize" - with options = "top-down=false" %max_iter "test-convergence=true" %max_rewrites to %1 + %max_iter = transform.param.constant 10 -> !transform.any_param + %max_rewrites = transform.param.constant 1 -> !transform.any_param + %2 = transform.apply_registered_pass + "canonicalize" + with options = { "top-down" = false, + "max-iterations" = %max_iter, + "test-convergence" = true, + "max-num-rewrites" = %max_rewrites } + 
to %1 : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op transform.yield } @@ -159,7 +164,7 @@ module attributes {transform.with_named_sequence} { // ----- -func.func @invalid_dynamic_options_as_array() { +func.func @invalid_options_as_str() { return } @@ -167,34 +172,80 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param - // expected-error @+2 {{expected at least one option (either a string or a param)}} + // expected-error @+2 {{expected '{' in options dictionary}} %2 = transform.apply_registered_pass "canonicalize" - with options = ["top-down=false" %max_iter] to %1 - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + with options = "top-down=false" to %1 : (!transform.any_op) -> !transform.any_op transform.yield } } // ----- -func.func @invalid_options_as_pairs() { +func.func @invalid_options_as_pairs_without_braces() { return } module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - // expected-error @+2 {{expected 'to'}} + // expected-error @+2 {{expected '{' in options dictionary}} %2 = transform.apply_registered_pass "canonicalize" - with options = "top-down=" false to %1 - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + with options = "top-down"=false to %1 : (!transform.any_op) -> !transform.any_op transform.yield } } // ----- -func.func @invalid_pass_option_param() { +func.func @invalid_options_due_to_reserved_attr() { + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence 
@__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @+2 {{the param_operand attribute is a marker reserved for indicating a value will be passed via params and is only used in the generic print format}} + %2 = transform.apply_registered_pass "canonicalize" + with options = { "top-down" = #transform.param_operand } to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +func.func @invalid_options_due_duplicated_key() { + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @+2 {{duplicate keys found in options dictionary}} + %2 = transform.apply_registered_pass "canonicalize" + with options = {"top-down"=false,"top-down"=true} to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +func.func @invalid_options_due_invalid_key() { + return +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg1: !transform.any_op) { + %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op + // expected-error @+2 {{expected key to either be an identifier or a string}} + %2 = transform.apply_registered_pass "canonicalize" + with options = { @label = 0 } to %1 : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +func.func @invalid_pass_option_bare_param() { return } @@ -202,7 +253,7 @@ module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op %pass_options = transform.param.constant 42 -> 
!transform.any_param - // expected-error @below {{options passed as a param must be a string, got 42}} + // expected-error @+2 {{expected '{' in options dictionary}} transform.apply_registered_pass "canonicalize" with options = %pass_options to %1 : (!transform.any_param, !transform.any_op) -> !transform.any_op @@ -219,12 +270,12 @@ func.func @too_many_pass_option_params() { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %x = transform.param.constant "x" -> !transform.any_param - %y = transform.param.constant "y" -> !transform.any_param - %pass_options = transform.merge_handles %x, %y : !transform.any_param + %x = transform.param.constant true -> !transform.any_param + %y = transform.param.constant false -> !transform.any_param + %topdown_options = transform.merge_handles %x, %y : !transform.any_param // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}} transform.apply_registered_pass "canonicalize" - with options = %pass_options to %1 + with options = { "top-down" = %topdown_options } to %1 : (!transform.any_param, !transform.any_op) -> !transform.any_op transform.yield } @@ -248,3 +299,77 @@ module attributes {transform.with_named_sequence} { transform.yield } } + +// ----- + +///////////////////////////////////////////////////////////////////// +// Check that the following cases are caugh in the generic format. // +///////////////////////////////////////////////////////////////////// + +// Invalid due to param_operand occurences in options dict not being +// one-to-one with the dynamic options provided as params: +// param_operand_index out of bounds w.r.t. the number of options provided via params. 
+ +"builtin.module"() ({ + "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({ + ^bb0(%arg0: !transform.any_op): + %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op + %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param + // expected-error @below {{dynamic option index 1 is out of bounds for the number of dynamic options: 1}} + %2 = "transform.apply_registered_pass"(%1, %0) <{ + options = {"max-iterations" = #transform.param_operand, + "test-convergence" = true, + "top-down" = false}, + pass_name = "canonicalize"}> + : (!transform.any_param, !transform.any_op) -> !transform.any_op + "transform.yield"() : () -> () + }) : () -> () +}) {transform.with_named_sequence} : () -> () + +// ----- + +// Invalid due to param_operand occurences in options dict not being +// one-to-one with the dynamic options provided as params: +// the first option-param is referred to twice and the second one not at all. +// (In the pretty-printed format, if you want to refer to a param SSA-value twice, it counts as two param arguments.) 
+ +"builtin.module"() ({ + "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({ + ^bb0(%arg0: !transform.any_op): + %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op + %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param + %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param + // expected-error @below {{dynamic option index 0 is already used in options}} + %3 = "transform.apply_registered_pass"(%1, %2, %0) <{ + options = {"max-iterations" = #transform.param_operand, + "max-num-rewrites" = #transform.param_operand, + "test-convergence" = true, + "top-down" = false}, + pass_name = "canonicalize"}> + : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + "transform.yield"() : () -> () + }) : () -> () +}) {transform.with_named_sequence} : () -> () + +// ----- + +// Invalid due to param_operand occurences in options dict not being +// one-to-one with the dynamic options provided as params: +// two option-params are provide though only the first one is referred to from the options-dict. 
+ +"builtin.module"() ({ + "transform.named_sequence"() <{function_type = (!transform.any_op) -> (), sym_name = "__transform_main"}> ({ + ^bb0(%arg0: !transform.any_op): + %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op + %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param + %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param + // expected-error @below {{a param operand does not have a corresponding param_operand attr in the options dict}} + %3 = "transform.apply_registered_pass"(%1, %2, %0) <{ + options = {"max-iterations" = #transform.param_operand, + "test-convergence" = true, + "top-down" = false}, + pass_name = "canonicalize"}> + : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + "transform.yield"() : () -> () + }) : () -> () +}) {transform.with_named_sequence} : () -> () diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py index 6ed4818fc9d2f..48bc9bad37a1e 100644 --- a/mlir/test/python/dialects/transform.py +++ b/mlir/test/python/dialects/transform.py @@ -254,3 +254,55 @@ def testReplicateOp(module: Module): # CHECK: %[[FIRST:.+]] = pdl_match # CHECK: %[[SECOND:.+]] = pdl_match # CHECK: %{{.*}} = replicate num(%[[FIRST]]) %[[SECOND]] + + +@run +def testApplyRegisteredPassOp(module: Module): + sequence = transform.SequenceOp( + transform.FailurePropagationMode.Propagate, [], transform.AnyOpType.get() + ) + with InsertionPoint(sequence.body): + mod = transform.ApplyRegisteredPassOp( + transform.AnyOpType.get(), "canonicalize", sequence.bodyTarget + ) + mod = transform.ApplyRegisteredPassOp( + transform.AnyOpType.get(), + "canonicalize", + mod.result, + options={"top-down": BoolAttr.get(False)}, + ) + max_iter = transform.param_constant( + transform.AnyParamType.get(), + IntegerAttr.get(IntegerType.get_signless(64), 10), + ) + max_rewrites = 
transform.param_constant( + transform.AnyParamType.get(), + IntegerAttr.get(IntegerType.get_signless(64), 1), + ) + transform.apply_registered_pass( + transform.AnyOpType.get(), + "canonicalize", + mod, + options={ + "top-down": BoolAttr.get(False), + "max-iterations": max_iter, + "test-convergence": BoolAttr.get(True), + "max-rewrites": max_rewrites, + }, + ) + transform.YieldOp() + # CHECK-LABEL: TEST: testApplyRegisteredPassOp + # CHECK: transform.sequence + # CHECK: %{{.*}} = apply_registered_pass "canonicalize" to {{.*}} : (!transform.any_op) -> !transform.any_op + # CHECK: %{{.*}} = apply_registered_pass "canonicalize" + # CHECK-SAME: with options = {"top-down" = false} + # CHECK-SAME: to {{.*}} : (!transform.any_op) -> !transform.any_op + # CHECK: %[[MAX_ITER:.+]] = transform.param.constant + # CHECK: %[[MAX_REWRITE:.+]] = transform.param.constant + # CHECK: %{{.*}} = apply_registered_pass "canonicalize" + # NB: MLIR has sorted the dict lexicographically by key: + # CHECK-SAME: with options = {"max-iterations" = %[[MAX_ITER]], + # CHECK-SAME: "max-rewrites" = %[[MAX_REWRITE]], + # CHECK-SAME: "test-convergence" = true, + # CHECK-SAME: "top-down" = false} + # CHECK-SAME: to %{{.*}} : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op From 459475020aeff15d0f886ab99c59d66b744d3e17 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 11 Jun 2025 16:35:55 +0100 Subject: [PATCH 105/851] Reapply 76197ea6f91f after removing an assertion Specifically this is the assertion in BasicBlock.cpp. Now that we're not examining or setting that flag consistently (because it'll be deleted in about an hour) there's no need to keep this assertion. 
Original commit title: [DebugInfo][RemoveDIs] Remove some debug intrinsic-only codepaths (#143451) --- llvm/lib/CodeGen/SelectionDAG/FastISel.cpp | 3 - llvm/lib/IR/AutoUpgrade.cpp | 25 ++---- llvm/lib/IR/BasicBlock.cpp | 1 - llvm/lib/IR/DIBuilder.cpp | 97 +++++----------------- llvm/lib/IR/DebugInfo.cpp | 19 +---- llvm/lib/Transforms/Utils/LoopUtils.cpp | 36 +++----- llvm/unittests/IR/IRBuilderTest.cpp | 10 --- 7 files changed, 40 insertions(+), 151 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp index 59cd0dc8dd348..e8a3df3366b2b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -1671,9 +1671,6 @@ void FastISel::fastEmitBranch(MachineBasicBlock *MSucc, const DebugLoc &DbgLoc) { const BasicBlock *BB = FuncInfo.MBB->getBasicBlock(); bool BlockHasMultipleInstrs = &BB->front() != &BB->back(); - // Handle legacy case of debug intrinsics - if (BlockHasMultipleInstrs && !BB->getModule()->IsNewDbgInfoFormat) - BlockHasMultipleInstrs = BB->sizeWithoutDebug() > 1; if (BlockHasMultipleInstrs && FuncInfo.MBB->isLayoutSuccessor(MSucc)) { // For more accurate line information if this is the only non-debug // instruction in the block then emit it, otherwise we have the diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index cb90af36f3d9f..a0886776ff93f 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -4490,7 +4490,6 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Builder.SetInsertPoint(CI->getParent(), CI->getIterator()); if (!NewFn) { - bool FallthroughToDefaultUpgrade = false; // Get the Function's name. 
StringRef Name = F->getName(); @@ -4518,29 +4517,15 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } else if (IsAMDGCN) { Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder); } else if (IsDbg) { - // We might have decided we don't want the new format after all between - // first requesting the upgrade and now; skip the conversion if that is - // the case, and check here to see if the intrinsic needs to be upgraded - // normally. - if (!CI->getModule()->IsNewDbgInfoFormat) { - bool NeedsUpgrade = - upgradeIntrinsicFunction1(CI->getCalledFunction(), NewFn, false); - if (!NeedsUpgrade) - return; - FallthroughToDefaultUpgrade = true; - } else { - upgradeDbgIntrinsicToDbgRecord(Name, CI); - } + upgradeDbgIntrinsicToDbgRecord(Name, CI); } else { llvm_unreachable("Unknown function for CallBase upgrade."); } - if (!FallthroughToDefaultUpgrade) { - if (Rep) - CI->replaceAllUsesWith(Rep); - CI->eraseFromParent(); - return; - } + if (Rep) + CI->replaceAllUsesWith(Rep); + CI->eraseFromParent(); + return; } const auto &DefaultCase = [&]() -> void { diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index f716e9970b841..62a75313bb171 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -60,7 +60,6 @@ void BasicBlock::convertToNewDbgValues() { // instruction. SmallVector DbgVarRecs; for (Instruction &I : make_early_inc_range(InstList)) { - assert(!I.DebugMarker && "DebugMarker already set on old-format instrs?"); if (DbgVariableIntrinsic *DVI = dyn_cast(&I)) { // Convert this dbg.value to a DbgVariableRecord. 
DbgVariableRecord *Value = new DbgVariableRecord(DVI); diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 5e5ff22132e99..1484c549dd580 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -1047,36 +1047,13 @@ DbgInstPtr DIBuilder::insertDbgAssign(Instruction *LinkedInstr, Value *Val, LinkedInstr->getMetadata(LLVMContext::MD_DIAssignID)); assert(Link && "Linked instruction must have DIAssign metadata attached"); - if (M.IsNewDbgInfoFormat) { - DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign( - Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL); - // Insert after LinkedInstr. - BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator()); - NextIt.setHeadBit(true); - insertDbgVariableRecord(DVR, NextIt); - return DVR; - } - - LLVMContext &Ctx = LinkedInstr->getContext(); - Module *M = LinkedInstr->getModule(); - if (!AssignFn) - AssignFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::dbg_assign); - - std::array Args = { - MetadataAsValue::get(Ctx, ValueAsMetadata::get(Val)), - MetadataAsValue::get(Ctx, SrcVar), - MetadataAsValue::get(Ctx, ValExpr), - MetadataAsValue::get(Ctx, Link), - MetadataAsValue::get(Ctx, ValueAsMetadata::get(Addr)), - MetadataAsValue::get(Ctx, AddrExpr), - }; - - IRBuilder<> B(Ctx); - B.SetCurrentDebugLocation(DL); - - auto *DVI = cast(B.CreateCall(AssignFn, Args)); - DVI->insertAfter(LinkedInstr->getIterator()); - return DVI; + DbgVariableRecord *DVR = DbgVariableRecord::createDVRAssign( + Val, SrcVar, ValExpr, Link, Addr, AddrExpr, DL); + // Insert after LinkedInstr. + BasicBlock::iterator NextIt = std::next(LinkedInstr->getIterator()); + NextIt.setHeadBit(true); + insertDbgVariableRecord(DVR, NextIt); + return DVR; } /// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics. 
@@ -1101,18 +1078,10 @@ DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val, DIExpression *Expr, const DILocation *DL, InsertPosition InsertPt) { - if (M.IsNewDbgInfoFormat) { - DbgVariableRecord *DVR = - DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertPt); - return DVR; - } - - if (!ValueFn) - ValueFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_value); - auto *DVI = insertDbgIntrinsic(ValueFn, Val, VarInfo, Expr, DL, InsertPt); - cast(DVI)->setTailCall(); - return DVI; + DbgVariableRecord *DVR = + DbgVariableRecord::createDbgVariableRecord(Val, VarInfo, Expr, DL); + insertDbgVariableRecord(DVR, InsertPt); + return DVR; } DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, @@ -1124,25 +1093,10 @@ DbgInstPtr DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo, VarInfo->getScope()->getSubprogram() && "Expected matching subprograms"); - if (M.IsNewDbgInfoFormat) { - DbgVariableRecord *DVR = - DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL); - insertDbgVariableRecord(DVR, InsertPt); - return DVR; - } - - if (!DeclareFn) - DeclareFn = getDeclareIntrin(M); - - trackIfUnresolved(VarInfo); - trackIfUnresolved(Expr); - Value *Args[] = {getDbgIntrinsicValueImpl(VMContext, Storage), - MetadataAsValue::get(VMContext, VarInfo), - MetadataAsValue::get(VMContext, Expr)}; - - IRBuilder<> B(DL->getContext()); - initIRBuilder(B, DL, InsertPt); - return B.CreateCall(DeclareFn, Args); + DbgVariableRecord *DVR = + DbgVariableRecord::createDVRDeclare(Storage, VarInfo, Expr, DL); + insertDbgVariableRecord(DVR, InsertPt); + return DVR; } void DIBuilder::insertDbgVariableRecord(DbgVariableRecord *DVR, @@ -1191,23 +1145,12 @@ DbgInstPtr DIBuilder::insertLabel(DILabel *LabelInfo, const DILocation *DL, "Expected matching subprograms"); trackIfUnresolved(LabelInfo); - if (M.IsNewDbgInfoFormat) { - DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL); - if 
(InsertPt.isValid()) { - auto *BB = InsertPt.getBasicBlock(); - BB->insertDbgRecordBefore(DLR, InsertPt); - } - return DLR; + DbgLabelRecord *DLR = new DbgLabelRecord(LabelInfo, DL); + if (InsertPt.isValid()) { + auto *BB = InsertPt.getBasicBlock(); + BB->insertDbgRecordBefore(DLR, InsertPt); } - - if (!LabelFn) - LabelFn = Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_label); - - Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)}; - - IRBuilder<> B(DL->getContext()); - initIRBuilder(B, DL, InsertPt); - return B.CreateCall(LabelFn, Args); + return DLR; } void DIBuilder::replaceVTableHolder(DICompositeType *&T, DIType *VTableHolder) { diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 7db9891fdbd75..2a84e7bae0f10 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2123,22 +2123,11 @@ static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest, Expr = *R; } DIExpression *AddrExpr = DIExpression::get(StoreLikeInst.getContext(), {}); - if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) { - auto *Assign = DbgVariableRecord::createLinkedDVRAssign( - &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL); - (void)Assign; - LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); - return; - } - auto Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest, - AddrExpr, VarRec.DL); + auto *Assign = DbgVariableRecord::createLinkedDVRAssign( + &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL); (void)Assign; - LLVM_DEBUG(if (!Assign.isNull()) { - if (const auto *Record = dyn_cast(Assign)) - errs() << " > INSERT: " << *Record << "\n"; - else - errs() << " > INSERT: " << *cast(Assign) << "\n"; - }); + LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); + return; } #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h). 
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp index 0681ebc111cb2..ff69fa9f70c4e 100644 --- a/llvm/lib/Transforms/Utils/LoopUtils.cpp +++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -606,7 +606,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, // Use a map to unique and a vector to guarantee deterministic ordering. llvm::SmallDenseSet DeadDebugSet; - llvm::SmallVector DeadDebugInst; llvm::SmallVector DeadDbgVariableRecords; if (ExitBlock) { @@ -633,29 +632,19 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, U.set(Poison); } - // RemoveDIs: do the same as below for DbgVariableRecords. - if (Block->IsNewDbgInfoFormat) { - for (DbgVariableRecord &DVR : llvm::make_early_inc_range( - filterDbgVars(I.getDbgRecordRange()))) { - DebugVariable Key(DVR.getVariable(), DVR.getExpression(), - DVR.getDebugLoc().get()); - if (!DeadDebugSet.insert(Key).second) - continue; - // Unlinks the DVR from it's container, for later insertion. - DVR.removeFromParent(); - DeadDbgVariableRecords.push_back(&DVR); - } - } - - // For one of each variable encountered, preserve a debug intrinsic (set + // For one of each variable encountered, preserve a debug record (set // to Poison) and transfer it to the loop exit. This terminates any // variable locations that were set during the loop. - auto *DVI = dyn_cast(&I); - if (!DVI) - continue; - if (!DeadDebugSet.insert(DebugVariable(DVI)).second) - continue; - DeadDebugInst.push_back(DVI); + for (DbgVariableRecord &DVR : + llvm::make_early_inc_range(filterDbgVars(I.getDbgRecordRange()))) { + DebugVariable Key(DVR.getVariable(), DVR.getExpression(), + DVR.getDebugLoc().get()); + if (!DeadDebugSet.insert(Key).second) + continue; + // Unlinks the DVR from it's container, for later insertion. 
+ DVR.removeFromParent(); + DeadDbgVariableRecords.push_back(&DVR); + } } // After the loop has been deleted all the values defined and modified @@ -671,9 +660,6 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE, "There should be a non-PHI instruction in exit block, else these " "instructions will have no parent."); - for (auto *DVI : DeadDebugInst) - DVI->moveBefore(*ExitBlock, InsertDbgValueBefore); - // Due to the "head" bit in BasicBlock::iterator, we're going to insert // each DbgVariableRecord right at the start of the block, wheras dbg.values // would be repeatedly inserted before the first instruction. To replicate diff --git a/llvm/unittests/IR/IRBuilderTest.cpp b/llvm/unittests/IR/IRBuilderTest.cpp index 3a7ba924792ef..aadae5287c380 100644 --- a/llvm/unittests/IR/IRBuilderTest.cpp +++ b/llvm/unittests/IR/IRBuilderTest.cpp @@ -1003,18 +1003,8 @@ TEST_F(IRBuilderTest, DIBuilder) { EXPECT_TRUE(verifyModule(*M)); }; - // Test in new-debug mode. - EXPECT_TRUE(M->IsNewDbgInfoFormat); RunTest(); - - // Test in old-debug mode. - // Reset the test then call convertFromNewDbgValues to flip the flag - // on the test's Module, Function and BasicBlock. 
TearDown(); - SetUp(); - M->convertFromNewDbgValues(); - EXPECT_FALSE(M->IsNewDbgInfoFormat); - RunTest(); } TEST_F(IRBuilderTest, createArtificialSubprogram) { From f1575de4c5de9268f92eea1641af755a477e4ee4 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 11 Jun 2025 11:37:12 -0500 Subject: [PATCH 106/851] [libc][NFC] Remove template from GPU allocator reference counter Summary: We don't need this to be generic, precommit for https://github.com/llvm/llvm-project/pull/143607 --- libc/src/__support/GPU/allocator.cpp | 32 ++++++++++++++-------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp index 135ced3df704c..ecc0de1cb6ec3 100644 --- a/libc/src/__support/GPU/allocator.cpp +++ b/libc/src/__support/GPU/allocator.cpp @@ -283,7 +283,7 @@ struct Slab { /// A wait-free guard around a pointer resource to be created dynamically if /// space is available and freed once there are no more users. -template struct GuardPtr { +struct GuardPtr { private: struct RefCounter { // Indicates that the object is in its deallocation phase and thus invalid. @@ -339,22 +339,22 @@ template struct GuardPtr { cpp::Atomic counter{0}; }; - cpp::Atomic ptr{nullptr}; + cpp::Atomic ptr{nullptr}; RefCounter ref{}; // Should be called be a single lane for each different pointer. 
template - T *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) { - T *expected = ptr.load(cpp::MemoryOrder::RELAXED); + Slab *try_lock_impl(uint32_t n, uint64_t &count, Args &&...args) { + Slab *expected = ptr.load(cpp::MemoryOrder::RELAXED); if (!expected && - ptr.compare_exchange_strong(expected, reinterpret_cast(SENTINEL), - cpp::MemoryOrder::RELAXED, - cpp::MemoryOrder::RELAXED)) { + ptr.compare_exchange_strong( + expected, reinterpret_cast(SENTINEL), + cpp::MemoryOrder::RELAXED, cpp::MemoryOrder::RELAXED)) { count = cpp::numeric_limits::max(); - void *raw = impl::rpc_allocate(sizeof(T)); + void *raw = impl::rpc_allocate(sizeof(Slab)); if (!raw) return nullptr; - T *mem = new (raw) T(cpp::forward(args)...); + Slab *mem = new (raw) Slab(cpp::forward(args)...); cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); ptr.store(mem, cpp::MemoryOrder::RELAXED); @@ -364,7 +364,7 @@ template struct GuardPtr { return mem; } - if (!expected || expected == reinterpret_cast(SENTINEL)) + if (!expected || expected == reinterpret_cast(SENTINEL)) return nullptr; if (!ref.acquire(n, count)) @@ -379,10 +379,10 @@ template struct GuardPtr { // The uniform mask represents which lanes share the same pointer. For each // uniform value we elect a leader to handle it on behalf of the other lanes. 
template - T *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count, - Args &&...args) { + Slab *try_lock(uint64_t lane_mask, uint64_t uniform, uint64_t &count, + Args &&...args) { count = 0; - T *result = nullptr; + Slab *result = nullptr; if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform))) result = try_lock_impl(cpp::popcount(uniform), count, cpp::forward(args)...); @@ -403,8 +403,8 @@ template struct GuardPtr { cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(mask)) && ref.release(cpp::popcount(mask))) { - T *p = ptr.load(cpp::MemoryOrder::RELAXED); - p->~T(); + Slab *p = ptr.load(cpp::MemoryOrder::RELAXED); + p->~Slab(); impl::rpc_free(p); cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); ptr.store(nullptr, cpp::MemoryOrder::RELAXED); @@ -417,7 +417,7 @@ template struct GuardPtr { }; // The global array used to search for a valid slab to allocate from. -static GuardPtr slots[ARRAY_SIZE] = {}; +static GuardPtr slots[ARRAY_SIZE] = {}; // Tries to find a slab in the table that can support the given chunk size. static Slab *find_slab(uint32_t chunk_size) { From aa8a1fa6f515f45db55365b9c1f8453ded24ed32 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Wed, 11 Jun 2025 18:42:10 +0200 Subject: [PATCH 107/851] [DLCov][NFC] Annotate intentionally-blank DebugLocs in existing code (#136192) Following the work in PR #107279, this patch applies the annotative DebugLocs, which indicate that a particular instruction is intentionally missing a location for a given reason, to existing sites in the compiler where their conditions apply. This is NFC in ordinary LLVM builds (each function `DebugLoc::getFoo()` is inlined as `DebugLoc()`), but marks the instruction in coverage-tracking builds so that it will be ignored by Debugify, allowing only real errors to be reported. From a developer standpoint, it also communicates the intentionality and reason for a missing DebugLoc. 
Some notes for reviewers: - The difference between `I->dropLocation()` and `I->setDebugLoc(DebugLoc::getDropped())` is that the former _may_ decide to keep some debug info alive, while the latter will always be empty; in this patch, I always used the latter (even if the former could technically be correct), because the former could result in some (barely) different output, and I'd prefer to keep this patch purely NFC. - I've generally documented the uses of `DebugLoc::getUnknown()`, with the exception of the vectorizers - in summary, they are a huge cause of dropped source locations, and I don't have the time or the domain knowledge currently to solve that, so I've plastered it all over them as a form of "fixme". --- llvm/lib/Transforms/IPO/GlobalOpt.cpp | 10 ++++-- llvm/lib/Transforms/IPO/IROutliner.cpp | 4 +-- .../Transforms/InstCombine/InstCombinePHI.cpp | 9 ++++- .../Scalar/CorrelatedValuePropagation.cpp | 3 +- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 3 ++ llvm/lib/Transforms/Scalar/JumpThreading.cpp | 4 ++- llvm/lib/Transforms/Scalar/LICM.cpp | 4 ++- .../Transforms/Scalar/LoopLoadElimination.cpp | 3 +- .../Transforms/Scalar/SimpleLoopUnswitch.cpp | 3 ++ .../Scalar/TailRecursionElimination.cpp | 4 ++- llvm/lib/Transforms/Utils/InlineFunction.cpp | 9 +++++ llvm/lib/Transforms/Utils/Local.cpp | 3 +- llvm/lib/Transforms/Utils/SCCPSolver.cpp | 4 ++- llvm/lib/Transforms/Utils/SSAUpdater.cpp | 5 +++ llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 10 +++--- .../Vectorize/LoopVectorizationPlanner.h | 34 ++++++++++++------- .../Transforms/Vectorize/LoopVectorize.cpp | 8 +++-- .../Transforms/Vectorize/SLPVectorizer.cpp | 12 +++++-- llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++-- 19 files changed, 101 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index b3fe0ab8b5cb4..7db0586386506 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -1494,8 +1494,14 
@@ processInternalGlobal(GlobalVariable *GV, const GlobalStatus &GS, // FIXME: Pass Global's alignment when globals have alignment AllocaInst *Alloca = new AllocaInst(ElemTy, DL.getAllocaAddrSpace(), nullptr, GV->getName(), FirstI); - if (!isa(GV->getInitializer())) - new StoreInst(GV->getInitializer(), Alloca, FirstI); + Alloca->setDebugLoc(DebugLoc::getCompilerGenerated()); + if (!isa(GV->getInitializer())) { + auto *SI = new StoreInst(GV->getInitializer(), Alloca, FirstI); + // FIXME: We're localizing a global and creating a store instruction for + // the initial value of that global. Could we logically use the global + // variable's (if one exists) line for this? + SI->setDebugLoc(DebugLoc::getCompilerGenerated()); + } GV->replaceAllUsesWith(Alloca); GV->eraseFromParent(); diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp index ff66a518be752..cb18b55ae2183 100644 --- a/llvm/lib/Transforms/IPO/IROutliner.cpp +++ b/llvm/lib/Transforms/IPO/IROutliner.cpp @@ -730,7 +730,7 @@ static void moveFunctionData(Function &Old, Function &New, // other outlined instructions. if (!isa(&Val)) { // Remove the debug information for outlined functions. - Val.setDebugLoc(DebugLoc()); + Val.setDebugLoc(DebugLoc::getDropped()); // Loop info metadata may contain line locations. 
Update them to have no // value in the new subprogram since the outlined code could be from @@ -1864,7 +1864,7 @@ replaceArgumentUses(OutlinableRegion &Region, Value *ValueOperand = SI->getValueOperand(); StoreInst *NewI = cast(I->clone()); - NewI->setDebugLoc(DebugLoc()); + NewI->setDebugLoc(DebugLoc::getDropped()); BasicBlock *OutputBB = VBBIt->second; NewI->insertInto(OutputBB, OutputBB->end()); LLVM_DEBUG(dbgs() << "Move store for instruction " << *I << " to " diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp index a842a5edcb8a3..6477141ab095f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp @@ -870,7 +870,14 @@ Instruction *InstCombinerImpl::foldPHIArgZextsIntoPHI(PHINode &Phi) { NewPhi->addIncoming(NewIncoming[I], Phi.getIncomingBlock(I)); InsertNewInstBefore(NewPhi, Phi.getIterator()); - return CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); + auto *CI = CastInst::CreateZExtOrBitCast(NewPhi, Phi.getType()); + + // We use a dropped location here because the new ZExt is necessarily a merge + // of ZExtInsts and at least one constant from incoming branches; the presence + // of the constant means we have no viable DebugLoc from that branch, and + // therefore we must use a dropped location. 
+ CI->setDebugLoc(DebugLoc::getDropped()); + return CI; } /// If all operands to a PHI node are the same "unary" operator and they all are diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index b95a851c99b49..4627f537dc16b 100644 --- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -432,7 +432,8 @@ static bool processSwitch(SwitchInst *I, LazyValueInfo *LVI, BasicBlock *NewUnreachableBB = BasicBlock::Create(BB->getContext(), "default.unreachable", BB->getParent(), DefaultDest); - new UnreachableInst(BB->getContext(), NewUnreachableBB); + auto *UI = new UnreachableInst(BB->getContext(), NewUnreachableBB); + UI->setDebugLoc(DebugLoc::getTemporary()); DefaultDest->removePredecessor(BB); SI->setDefaultDest(NewUnreachableBB); diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 95d52b9b4e189..334c911191cb8 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -1506,6 +1506,9 @@ bool IndVarSimplify::canonicalizeExitCondition(Loop *L) { auto *NewRHS = CastInst::Create( Instruction::Trunc, RHS, LHSOp->getType(), "", L->getLoopPreheader()->getTerminator()->getIterator()); + // NewRHS is an operation that has been hoisted out of the loop, and + // therefore should have a dropped location. + NewRHS->setDebugLoc(DebugLoc::getDropped()); ICmp->setOperand(Swapped ? 1 : 0, LHSOp); ICmp->setOperand(Swapped ? 0 : 1, NewRHS); // Samesign flag cannot be preserved after narrowing the compare. 
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 9449b4cb35b93..37b85bf9de811 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -3001,8 +3001,10 @@ bool JumpThreadingPass::tryToUnfoldSelectInCurrBB(BasicBlock *BB) { continue; // Expand the select. Value *Cond = SI->getCondition(); - if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) + if (!isGuaranteedNotToBeUndefOrPoison(Cond, nullptr, SI)) { Cond = new FreezeInst(Cond, "cond.fr", SI->getIterator()); + cast(Cond)->setDebugLoc(DebugLoc::getTemporary()); + } MDNode *BranchWeights = getBranchWeightMDNode(*SI); Instruction *Term = SplitBlockAndInsertIfThen(Cond, SI, false, BranchWeights); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 9773ef778b690..3024ccb330b1a 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -2248,7 +2248,7 @@ bool llvm::promoteLoopAccessesToScalars( if (SawUnorderedAtomic) PreheaderLoad->setOrdering(AtomicOrdering::Unordered); PreheaderLoad->setAlignment(Alignment); - PreheaderLoad->setDebugLoc(DebugLoc()); + PreheaderLoad->setDebugLoc(DebugLoc::getDropped()); if (AATags && LoadIsGuaranteedToExecute) PreheaderLoad->setAAMetadata(AATags); @@ -2808,6 +2808,7 @@ static bool hoistMulAddAssociation(Instruction &I, Loop &L, auto *NewBO = BinaryOperator::Create(Ins->getOpcode(), LHS, RHS, Ins->getName() + ".reass", Ins->getIterator()); + NewBO->setDebugLoc(DebugLoc::getDropped()); NewBO->copyIRFlags(Ins); if (VariantOp == Ins) VariantOp = NewBO; @@ -2864,6 +2865,7 @@ static bool hoistBOAssociation(Instruction &I, Loop &L, auto *NewBO = BinaryOperator::Create( Opcode, LV, Inv, BO->getName() + ".reass", BO->getIterator()); + NewBO->setDebugLoc(DebugLoc::getDropped()); if (Opcode == Instruction::FAdd || Opcode == Instruction::FMul) { // Intersect FMF flags for FADD and FMUL. 
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp index 39e8d702a692e..6bdf76f789a49 100644 --- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp +++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp @@ -442,7 +442,7 @@ class LoadEliminationForLoop { assert(PH && "Preheader should exist!"); Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(), PH->getTerminator()); - Value *Initial = + Instruction *Initial = new LoadInst(Cand.Load->getType(), InitialPtr, "load_initial", /* isVolatile */ false, Cand.Load->getAlign(), PH->getTerminator()->getIterator()); @@ -450,6 +450,7 @@ class LoadEliminationForLoop { // into the loop's preheader. A debug location inside the loop will cause // a misleading stepping when debugging. The test update-debugloc-store // -forwarded.ll checks this. + Initial->setDebugLoc(DebugLoc::getDropped()); PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded"); PHI->insertBefore(L->getHeader()->begin()); diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index 0bf90036b8b82..9b40fc03da6bb 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -274,6 +274,7 @@ static void buildPartialUnswitchConditionalBranch( BasicBlock &UnswitchedSucc, BasicBlock &NormalSucc, bool InsertFreeze, const Instruction *I, AssumptionCache *AC, const DominatorTree &DT) { IRBuilder<> IRB(&BB); + IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); SmallVector FrozenInvariants; for (Value *Inv : Invariants) { @@ -330,6 +331,7 @@ static void buildPartialInvariantUnswitchConditionalBranch( } IRBuilder<> IRB(&BB); + IRB.SetCurrentDebugLocation(DebugLoc::getCompilerGenerated()); Value *Cond = VMap[ToDuplicate[0]]; IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc, Direction ? 
&NormalSucc : &UnswitchedSucc); @@ -2369,6 +2371,7 @@ static void unswitchNontrivialInvariants( // BI (`dyn_cast(TI)`) is an in-loop instruction hoisted // out of the loop. Cond = new FreezeInst(Cond, Cond->getName() + ".fr", BI->getIterator()); + cast(Cond)->setDebugLoc(DebugLoc::getDropped()); } BI->setCondition(Cond); DTUpdates.push_back({DominatorTree::Insert, SplitBB, ClonedPH}); diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp index 7dd6c60370ed9..c71c5a70a12fd 100644 --- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp +++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp @@ -515,7 +515,8 @@ void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) { BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB); NewEntry->takeName(HeaderBB); HeaderBB->setName("tailrecurse"); - BranchInst::Create(HeaderBB, NewEntry); + auto *BI = BranchInst::Create(HeaderBB, NewEntry); + BI->setDebugLoc(DebugLoc::getCompilerGenerated()); // If the new branch preserves the debug location of CI, it could result in // misleading stepping, if CI is located in a conditional branch. // So, here we don't give any debug location to the new branch. 
@@ -801,6 +802,7 @@ void TailRecursionEliminator::cleanupAndFinalize() { SelectInst *SI = SelectInst::Create(RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI->getIterator()); + SI->setDebugLoc(DebugLoc::getCompilerGenerated()); RetSelects.push_back(SI); RI->setOperand(0, SI); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 7a9605bf5f8d4..f47c467d15140 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1775,6 +1775,7 @@ static Value *HandleByValArgument(Type *ByValType, Value *Arg, AllocaInst *NewAlloca = new AllocaInst(ByValType, Arg->getType()->getPointerAddressSpace(), nullptr, Alignment, Arg->getName()); + NewAlloca->setDebugLoc(DebugLoc::getCompilerGenerated()); NewAlloca->insertBefore(Caller->begin()->begin()); IFI.StaticAllocas.push_back(NewAlloca); @@ -3258,6 +3259,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, // Add an unconditional branch to make this look like the CallInst case... CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), CB.getIterator()); + // We intend to replace this DebugLoc with another later. + CreatedBranchToNormalDest->setDebugLoc(DebugLoc::getTemporary()); // Split the basic block. This guarantees that no PHI nodes will have to be // updated due to new incoming edges, and make the invoke case more @@ -3359,6 +3362,12 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, Returns[0]->eraseFromParent(); ReturnBB->eraseFromParent(); } else if (!CB.use_empty()) { + // In this case there are no returns to use, so there is no clear source + // location for the "return". + // FIXME: It may be correct to use the scope end line of the function here, + // since this likely means we are falling out of the function. 
+ if (CreatedBranchToNormalDest) + CreatedBranchToNormalDest->setDebugLoc(DebugLoc::getUnknown()); // No returns, but something is using the return value of the call. Just // nuke the result. CB.replaceAllUsesWith(PoisonValue::get(CB.getType())); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 2630a1a7a6af4..a3252a69874d3 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -3127,7 +3127,8 @@ static bool markAliveBlocks(Function &F, BasicBlock *UnreachableNormalDest = BasicBlock::Create( Ctx, OrigNormalDest->getName() + ".unreachable", II->getFunction(), OrigNormalDest); - new UnreachableInst(Ctx, UnreachableNormalDest); + auto *UI = new UnreachableInst(Ctx, UnreachableNormalDest); + UI->setDebugLoc(DebugLoc::getTemporary()); II->setNormalDest(UnreachableNormalDest); if (DTU) DTU->applyUpdates( diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp index 1a2e422356270..f4b378b82daec 100644 --- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp +++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp @@ -348,7 +348,9 @@ bool SCCPSolver::removeNonFeasibleEdges(BasicBlock *BB, DomTreeUpdater &DTU, NewUnreachableBB = BasicBlock::Create(DefaultDest->getContext(), "default.unreachable", DefaultDest->getParent(), DefaultDest); - new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB); + auto *UI = + new UnreachableInst(DefaultDest->getContext(), NewUnreachableBB); + UI->setDebugLoc(DebugLoc::getTemporary()); } DefaultDest->removePredecessor(BB); diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp index 48d9528f0c3df..5db7fc956c497 100644 --- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp +++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp @@ -318,6 +318,11 @@ class SSAUpdaterTraits { SSAUpdater *Updater) { PHINode *PHI = PHINode::Create(Updater->ProtoType, NumPreds, Updater->ProtoName); + // FIXME: Ordinarily we don't 
care about or try to assign DebugLocs to PHI + // nodes, but loop optimizations may try to use a PHI node as a DebugLoc + // source (e.g. if this is an induction variable), and it's not clear what + // location we could attach here, so mark this unknown for now. + PHI->setDebugLoc(DebugLoc::getUnknown()); PHI->insertBefore(BB->begin()); return PHI; } diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index e221022bb8361..975ce3bef5176 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1137,7 +1137,7 @@ static void cloneInstructionsIntoPredecessorBlockAndUpdateSSAUses( // branch, drop it. When we fold the bonus instructions we want to make // sure we reset their debug locations in order to avoid stepping on // dead code caused by folding dead branches. - NewBonusInst->setDebugLoc(DebugLoc()); + NewBonusInst->setDebugLoc(DebugLoc::getDropped()); } else if (const DebugLoc &DL = NewBonusInst->getDebugLoc()) { mapAtomInstance(DL, VMap); } @@ -2821,7 +2821,8 @@ static void mergeCompatibleInvokesImpl(ArrayRef Invokes, // so just form a new block with unreachable terminator. BasicBlock *MergedNormalDest = BasicBlock::Create( Ctx, II0BB->getName() + ".cont", Func, InsertBeforeBlock); - new UnreachableInst(Ctx, MergedNormalDest); + auto *UI = new UnreachableInst(Ctx, MergedNormalDest); + UI->setDebugLoc(DebugLoc::getTemporary()); MergedInvoke->setNormalDest(MergedNormalDest); } @@ -3389,7 +3390,7 @@ bool SimplifyCFGOpt::speculativelyExecuteBB(BranchInst *BI, if (!SpeculatedStoreValue || &I != SpeculatedStore) { // Don't update the DILocation of dbg.assign intrinsics. 
if (!isa(&I)) - I.setDebugLoc(DebugLoc()); + I.setDebugLoc(DebugLoc::getDropped()); } I.dropUBImplyingAttrsAndMetadata(); @@ -5707,7 +5708,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch, BasicBlock *NewDefaultBlock = BasicBlock::Create( BB->getContext(), BB->getName() + ".unreachabledefault", BB->getParent(), OrigDefaultBlock); - new UnreachableInst(Switch->getContext(), NewDefaultBlock); + auto *UI = new UnreachableInst(Switch->getContext(), NewDefaultBlock); + UI->setDebugLoc(DebugLoc::getTemporary()); Switch->setDefaultDest(&*NewDefaultBlock); if (DTU) { SmallVector Updates; diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h index b81d582f07e88..70f541d64b305 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h +++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h @@ -153,7 +153,7 @@ class VPBuilder { VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, Instruction *Inst = nullptr, const Twine &Name = "") { - DebugLoc DL; + DebugLoc DL = DebugLoc::getUnknown(); if (Inst) DL = Inst->getDebugLoc(); VPInstruction *NewVPInst = createInstruction(Opcode, Operands, DL, Name); @@ -165,7 +165,8 @@ class VPBuilder { return createInstruction(Opcode, Operands, DL, Name); } VPInstruction *createNaryOp(unsigned Opcode, ArrayRef Operands, - const VPIRFlags &Flags, DebugLoc DL = {}, + const VPIRFlags &Flags, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(Opcode, Operands, Flags, DL, Name)); @@ -174,7 +175,8 @@ class VPBuilder { VPInstruction *createNaryOp(unsigned Opcode, std::initializer_list Operands, Type *ResultTy, const VPIRFlags &Flags = {}, - DebugLoc DL = {}, const Twine &Name = "") { + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { return tryInsertInstruction( new VPInstructionWithType(Opcode, Operands, ResultTy, Flags, DL, Name)); } @@ -182,22 +184,25 
@@ class VPBuilder { VPInstruction *createOverflowingOp(unsigned Opcode, std::initializer_list Operands, VPRecipeWithIRFlags::WrapFlagsTy WrapFlags, - DebugLoc DL = {}, const Twine &Name = "") { + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(Opcode, Operands, WrapFlags, DL, Name)); } - VPValue *createNot(VPValue *Operand, DebugLoc DL = {}, + VPValue *createNot(VPValue *Operand, DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return createInstruction(VPInstruction::Not, {Operand}, DL, Name); } - VPValue *createAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, + VPValue *createAnd(VPValue *LHS, VPValue *RHS, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return createInstruction(Instruction::BinaryOps::And, {LHS, RHS}, DL, Name); } - VPValue *createOr(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, + VPValue *createOr(VPValue *LHS, VPValue *RHS, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction(new VPInstruction( @@ -205,14 +210,16 @@ class VPBuilder { VPRecipeWithIRFlags::DisjointFlagsTy(false), DL, Name)); } - VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, DebugLoc DL = {}, + VPValue *createLogicalAnd(VPValue *LHS, VPValue *RHS, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::LogicalAnd, {LHS, RHS}, DL, Name)); } VPValue *createSelect(VPValue *Cond, VPValue *TrueVal, VPValue *FalseVal, - DebugLoc DL = {}, const Twine &Name = "", + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "", std::optional FMFs = std::nullopt) { auto *Select = FMFs ? new VPInstruction(Instruction::Select, {Cond, TrueVal, FalseVal}, @@ -226,20 +233,23 @@ class VPBuilder { /// and \p B. /// TODO: add createFCmp when needed. 
VPValue *createICmp(CmpInst::Predicate Pred, VPValue *A, VPValue *B, - DebugLoc DL = {}, const Twine &Name = "") { + DebugLoc DL = DebugLoc::getUnknown(), + const Twine &Name = "") { assert(Pred >= CmpInst::FIRST_ICMP_PREDICATE && Pred <= CmpInst::LAST_ICMP_PREDICATE && "invalid predicate"); return tryInsertInstruction( new VPInstruction(Instruction::ICmp, {A, B}, Pred, DL, Name)); } - VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {}, + VPInstruction *createPtrAdd(VPValue *Ptr, VPValue *Offset, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, GEPNoWrapFlags::none(), DL, Name)); } - VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, DebugLoc DL = {}, + VPValue *createInBoundsPtrAdd(VPValue *Ptr, VPValue *Offset, + DebugLoc DL = DebugLoc::getUnknown(), const Twine &Name = "") { return tryInsertInstruction( new VPInstruction(VPInstruction::PtrAdd, {Ptr, Offset}, diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 427c1460fcfc9..2a237f42e4042 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -772,7 +772,7 @@ class EpilogueVectorizerEpilogueLoop : public InnerLoopAndEpilogueVectorizer { /// Look for a meaningful debug location on the instruction or its operands. 
static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) { if (!I) - return DebugLoc(); + return DebugLoc::getUnknown(); DebugLoc Empty; if (I->getDebugLoc() != Empty) @@ -1881,13 +1881,15 @@ class GeneratedRTChecks { if (SCEVCheckBlock) { SCEVCheckBlock->getTerminator()->moveBefore( Preheader->getTerminator()->getIterator()); - new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); + auto *UI = new UnreachableInst(Preheader->getContext(), SCEVCheckBlock); + UI->setDebugLoc(DebugLoc::getTemporary()); Preheader->getTerminator()->eraseFromParent(); } if (MemCheckBlock) { MemCheckBlock->getTerminator()->moveBefore( Preheader->getTerminator()->getIterator()); - new UnreachableInst(Preheader->getContext(), MemCheckBlock); + auto *UI = new UnreachableInst(Preheader->getContext(), MemCheckBlock); + UI->setDebugLoc(DebugLoc::getTemporary()); Preheader->getTerminator()->eraseFromParent(); } diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index ec40124c57a6a..c3ca22dce0cc4 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -17434,6 +17434,12 @@ static Instruction *propagateMetadata(Instruction *Inst, ArrayRef VL) { return llvm::propagateMetadata(Inst, Insts); } +static DebugLoc getDebugLocFromPHI(PHINode &PN) { + if (DebugLoc DL = PN.getDebugLoc()) + return DL; + return DebugLoc::getUnknown(); +} + Value *BoUpSLP::vectorizeTree(TreeEntry *E) { IRBuilderBase::InsertPointGuard Guard(Builder); @@ -17599,14 +17605,14 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { auto *PH = cast(VL0); Builder.SetInsertPoint(PH->getParent(), PH->getParent()->getFirstNonPHIIt()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH)); PHINode *NewPhi = Builder.CreatePHI(VecTy, PH->getNumIncomingValues()); Value *V = NewPhi; // Adjust insertion point once all PHI's have been generated. 
Builder.SetInsertPoint(PH->getParent(), PH->getParent()->getFirstInsertionPt()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH)); V = FinalShuffle(V, E); @@ -17638,7 +17644,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) { } Builder.SetInsertPoint(IBB->getTerminator()); - Builder.SetCurrentDebugLocation(PH->getDebugLoc()); + Builder.SetCurrentDebugLocation(getDebugLocFromPHI(*PH)); Value *Vec = vectorizeOperand(E, I); if (VecTy != Vec->getType()) { assert((It != MinBWs.end() || getOperandEntry(E, I)->isGather() || diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index bbcbfee4e471b..acc861b991975 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1816,9 +1816,9 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, class VPHeaderPHIRecipe : public VPSingleDefRecipe, public VPPhiAccessors { protected: VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr, - VPValue *Start, DebugLoc DL = {}) - : VPSingleDefRecipe(VPDefID, ArrayRef({Start}), UnderlyingInstr, DL) { - } + VPValue *Start, DebugLoc DL = DebugLoc::getUnknown()) + : VPSingleDefRecipe(VPDefID, ArrayRef({Start}), + UnderlyingInstr, DL) {} const VPRecipeBase *getAsRecipe() const override { return this; } From 117e78fe5012087c1ee535b91936bf4d8e3c7785 Mon Sep 17 00:00:00 2001 From: William <113542065+saturn691@users.noreply.github.com> Date: Wed, 11 Jun 2025 17:51:34 +0100 Subject: [PATCH 108/851] [libc] Add NULL macro definitions to header files (#142764) By the C standard, <locale.h>, <stdio.h>, <stdlib.h>, <string.h>, <time.h>, and <wchar.h> require NULL to be defined.
--- libc/include/CMakeLists.txt | 5 +++++ libc/include/locale.yaml | 3 +++ libc/include/stdio.yaml | 2 ++ libc/include/stdlib.yaml | 4 +++- libc/include/string.h.def | 2 -- libc/include/string.yaml | 4 +++- libc/include/time.yaml | 4 +++- libc/include/wchar.yaml | 4 +++- 8 files changed, 22 insertions(+), 6 deletions(-) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 7209e10c68b8f..55268d19529c7 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -255,6 +255,7 @@ add_header_macro( time.h DEPENDS .llvm_libc_common_h + .llvm-libc-macros.null_macro .llvm-libc-macros.time_macros .llvm-libc-types.clock_t .llvm-libc-types.time_t @@ -329,6 +330,7 @@ add_header_macro( stdio.h DEPENDS .llvm-libc-macros.file_seek_macros + .llvm-libc-macros.null_macro .llvm-libc-macros.stdio_macros .llvm-libc-types.FILE .llvm-libc-types.cookie_io_functions_t @@ -343,6 +345,7 @@ add_header_macro( ../libc/include/stdlib.yaml stdlib.h DEPENDS + .llvm-libc-macros.null_macro .llvm-libc-macros.stdlib_macros .llvm-libc-types.__atexithandler_t .llvm-libc-types.__qsortcompare_t @@ -709,6 +712,7 @@ add_header_macro( wchar.h DEPENDS .llvm_libc_common_h + .llvm-libc-macros.null_macro .llvm-libc-macros.wchar_macros .llvm-libc-types.mbstate_t .llvm-libc-types.size_t @@ -723,6 +727,7 @@ add_header_macro( DEPENDS .llvm_libc_common_h .llvm-libc-macros.locale_macros + .llvm-libc-macros.null_macro .llvm-libc-types.locale_t .llvm-libc-types.struct_lconv ) diff --git a/libc/include/locale.yaml b/libc/include/locale.yaml index 6c71b70e59f0b..4566984ad83af 100644 --- a/libc/include/locale.yaml +++ b/libc/include/locale.yaml @@ -1,5 +1,8 @@ header: locale.h header_template: locale.h.def +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: locale_t - type_name: struct_lconv diff --git a/libc/include/stdio.yaml b/libc/include/stdio.yaml index 2619984cca264..3d5164fa10ffb 100644 --- a/libc/include/stdio.yaml +++ b/libc/include/stdio.yaml 
@@ -1,6 +1,8 @@ header: stdio.h header_template: stdio.h.def macros: + - macro_name: NULL + macro_header: null-macro.h - macro_name: stdout macro_value: stdout - macro_name: stdin diff --git a/libc/include/stdlib.yaml b/libc/include/stdlib.yaml index f7155ba27a162..3b2ff13c684b1 100644 --- a/libc/include/stdlib.yaml +++ b/libc/include/stdlib.yaml @@ -4,7 +4,9 @@ standards: - stdc merge_yaml_files: - stdlib-malloc.yaml -macros: [] +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: __atexithandler_t - type_name: __qsortcompare_t diff --git a/libc/include/string.h.def b/libc/include/string.h.def index 1bd2687db2bea..339d005e43a4f 100644 --- a/libc/include/string.h.def +++ b/libc/include/string.h.def @@ -11,8 +11,6 @@ #include "__llvm-libc-common.h" -#include "llvm-libc-macros/null-macro.h" - %%public_api() #endif // LLVM_LIBC_STRING_H diff --git a/libc/include/string.yaml b/libc/include/string.yaml index 9f72b8db6c1eb..736deceb453de 100644 --- a/libc/include/string.yaml +++ b/libc/include/string.yaml @@ -1,6 +1,8 @@ header: string.h header_template: string.h.def -macros: [] +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: locale_t - type_name: size_t diff --git a/libc/include/time.yaml b/libc/include/time.yaml index 7bb25dbe85ac4..3b9d77c0aaae2 100644 --- a/libc/include/time.yaml +++ b/libc/include/time.yaml @@ -1,6 +1,8 @@ header: time.h header_template: time.h.def -macros: [] +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: struct_timeval - type_name: clockid_t diff --git a/libc/include/wchar.yaml b/libc/include/wchar.yaml index 877be48b6a10f..57f4f6660827e 100644 --- a/libc/include/wchar.yaml +++ b/libc/include/wchar.yaml @@ -1,6 +1,8 @@ header: wchar.h header_template: wchar.h.def -macros: [] +macros: + - macro_name: NULL + macro_header: null-macro.h types: - type_name: size_t - type_name: wint_t From 469922f7c40a1733fba98e29fa2bd09a9565ddd6 Mon Sep 17 00:00:00 2001 From: Sami 
Tolvanen Date: Wed, 11 Jun 2025 16:57:23 +0000 Subject: [PATCH 109/851] [X86] Don't emit ENDBR for asm goto branch targets (#143439) Similarly to #141562, which disabled BTI generation for ARM asm goto branch targets, drop unnecessary ENDBRs from IsInlineAsmBrIndirectTarget machine basic blocks. --- .../Target/X86/X86IndirectBranchTracking.cpp | 2 +- llvm/test/CodeGen/X86/callbr-asm-endbr.ll | 34 +++++++++++++++++++ 2 files changed, 35 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/X86/callbr-asm-endbr.ll diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp index 7740a174af4f3..52be14228e555 100644 --- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp +++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp @@ -147,7 +147,7 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) { for (auto &MBB : MF) { // Find all basic blocks that their address was taken (for example // in the case of indirect jump) and add ENDBR instruction. 
- if (MBB.hasAddressTaken()) + if (MBB.isMachineBlockAddressTaken() || MBB.isIRBlockAddressTaken()) Changed |= addENDBR(MBB, MBB.begin()); for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) { diff --git a/llvm/test/CodeGen/X86/callbr-asm-endbr.ll b/llvm/test/CodeGen/X86/callbr-asm-endbr.ll new file mode 100644 index 0000000000000..133de89d5f3a1 --- /dev/null +++ b/llvm/test/CodeGen/X86/callbr-asm-endbr.ll @@ -0,0 +1,34 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +define i32 @test1(i32 %a) { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: endbr64 +; CHECK-NEXT: addl $4, %edi +; CHECK-NEXT: #APP +; CHECK-NEXT: xorl %edi, %edi +; CHECK-NEXT: jmp .LBB0_2 +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: # %bb.1: # %normal +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_2: # Inline asm indirect target +; CHECK-NEXT: # %fail +; CHECK-NEXT: # Label of block must be emitted +; CHECK-NEXT: movl $1, %eax +; CHECK-NEXT: retq +entry: + %0 = add i32 %a, 4 + callbr void asm "xorl $0, $0; jmp ${1:l}", "r,!i,~{dirflag},~{fpsr},~{flags}"(i32 %0) to label %normal [label %fail] + +normal: + ret i32 0 + +fail: + ret i32 1 +} + +!llvm.module.flags = !{!0} + +!0 = !{i32 8, !"cf-protection-branch", i32 1} From 145b1b0f103e61cfc8a47ed37080e955630a1390 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 11 Jun 2025 09:57:42 -0700 Subject: [PATCH 110/851] [lldb][nfc] Factor out code checking if Variable is in scope (#143572) This is useful for checking whether a variable is in scope inside a specific block. 
--- lldb/include/lldb/Symbol/Variable.h | 3 ++ lldb/source/Symbol/Variable.cpp | 46 +++++++++++++++-------------- 2 files changed, 27 insertions(+), 22 deletions(-) diff --git a/lldb/include/lldb/Symbol/Variable.h b/lldb/include/lldb/Symbol/Variable.h index c437624d1ea6d..5b9c709c8b867 100644 --- a/lldb/include/lldb/Symbol/Variable.h +++ b/lldb/include/lldb/Symbol/Variable.h @@ -89,6 +89,9 @@ class Variable : public UserID, public std::enable_shared_from_this { bool IsInScope(StackFrame *frame); + /// Returns true if this variable is in scope at `addr` inside `block`. + bool IsInScope(const Block &block, const Address &addr); + bool LocationIsValidForFrame(StackFrame *frame); bool LocationIsValidForAddress(const Address &address); diff --git a/lldb/source/Symbol/Variable.cpp b/lldb/source/Symbol/Variable.cpp index 8244725aba545..af32e0e958e51 100644 --- a/lldb/source/Symbol/Variable.cpp +++ b/lldb/source/Symbol/Variable.cpp @@ -290,28 +290,9 @@ bool Variable::IsInScope(StackFrame *frame) { // this variable was defined in is currently Block *deepest_frame_block = frame->GetSymbolContext(eSymbolContextBlock).block; - if (deepest_frame_block) { - SymbolContext variable_sc; - CalculateSymbolContext(&variable_sc); - - // Check for static or global variable defined at the compile unit - // level that wasn't defined in a block - if (variable_sc.block == nullptr) - return true; - - // Check if the variable is valid in the current block - if (variable_sc.block != deepest_frame_block && - !variable_sc.block->Contains(deepest_frame_block)) - return false; - - // If no scope range is specified then it means that the scope is the - // same as the scope of the enclosing lexical block. 
- if (m_scope_range.IsEmpty()) - return true; - - addr_t file_address = frame->GetFrameCodeAddress().GetFileAddress(); - return m_scope_range.FindEntryThatContains(file_address) != nullptr; - } + Address frame_addr = frame->GetFrameCodeAddress(); + if (deepest_frame_block) + return IsInScope(*deepest_frame_block, frame_addr); } break; @@ -321,6 +302,27 @@ bool Variable::IsInScope(StackFrame *frame) { return false; } +bool Variable::IsInScope(const Block &block, const Address &addr) { + SymbolContext variable_sc; + CalculateSymbolContext(&variable_sc); + + // Check for static or global variable defined at the compile unit + // level that wasn't defined in a block + if (variable_sc.block == nullptr) + return true; + + // Check if the variable is valid in the current block + if (variable_sc.block != &block && !variable_sc.block->Contains(&block)) + return false; + + // If no scope range is specified then it means that the scope is the + // same as the scope of the enclosing lexical block. + if (m_scope_range.IsEmpty()) + return true; + + return m_scope_range.FindEntryThatContains(addr.GetFileAddress()) != nullptr; +} + Status Variable::GetValuesForVariableExpressionPath( llvm::StringRef variable_expr_path, ExecutionContextScope *scope, GetVariableCallback callback, void *baton, VariableList &variable_list, From 370e54d03a5bb11f3f283ad5ab479501c74069c7 Mon Sep 17 00:00:00 2001 From: Amr Hesham Date: Wed, 11 Jun 2025 19:02:36 +0200 Subject: [PATCH 111/851] [CIR] Upstream splat op for VectorType (#139827) This change adds support for splat op for VectorType Issue https://github.com/llvm/llvm-project/issues/136487 --- clang/include/clang/CIR/Dialect/IR/CIROps.td | 32 ++++++++++ clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp | 8 +++ .../CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp | 51 +++++++++++++++ .../CIR/Lowering/DirectToLLVM/LowerToLLVM.h | 10 +++ clang/test/CIR/CodeGen/vector-ext.cpp | 64 +++++++++++++++++++ clang/test/CIR/CodeGen/vector.cpp | 63 ++++++++++++++++++ 
clang/test/CIR/IR/vector.cir | 33 ++++++++++ 7 files changed, 261 insertions(+) diff --git a/clang/include/clang/CIR/Dialect/IR/CIROps.td b/clang/include/clang/CIR/Dialect/IR/CIROps.td index 565c0676773e6..634f0dd554c77 100644 --- a/clang/include/clang/CIR/Dialect/IR/CIROps.td +++ b/clang/include/clang/CIR/Dialect/IR/CIROps.td @@ -2277,6 +2277,38 @@ def VecTernaryOp : CIR_Op<"vec.ternary", let hasFolder = 1; } +//===----------------------------------------------------------------------===// +// VecSplatOp +//===----------------------------------------------------------------------===// + +def VecSplatOp : CIR_Op<"vec.splat", [Pure, + TypesMatchWith<"type of 'value' matches element type of 'result'", "result", + "value", "cast($_self).getElementType()">]> { + + let summary = "Convert a scalar into a vector"; + let description = [{ + The `cir.vec.splat` operation creates a vector value from a scalar value. + All elements of the vector have the same value, that of the given scalar. + + It's a separate operation from `cir.vec.create` because more + efficient LLVM IR can be generated for it, and because some optimization and + analysis passes can benefit from knowing that all elements of the vector + have the same value. 
+ + ```mlir + %value = cir.const #cir.int<3> : !s32i + %value_vec = cir.vec.splat %value : !s32i, !cir.vector<4 x !s32i> + ``` + }]; + + let arguments = (ins CIR_VectorElementType:$value); + let results = (outs CIR_VectorType:$result); + + let assemblyFormat = [{ + $value `:` type($value) `,` qualified(type($result)) attr-dict + }]; +} + //===----------------------------------------------------------------------===// // BaseClassAddrOp //===----------------------------------------------------------------------===// diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 481eb492d1875..30d231e2c61de 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -1780,6 +1780,14 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { cgf.convertType(destTy)); } + case CK_VectorSplat: { + // Create a vector object and fill all elements with the same scalar value. + assert(destTy->isVectorType() && "CK_VectorSplat to non-vector type"); + return builder.create( + cgf.getLoc(subExpr->getSourceRange()), cgf.convertType(destTy), + Visit(subExpr)); + } + default: cgf.getCIRGenModule().errorNYI(subExpr->getSourceRange(), "CastExpr: ", ce->getCastKindName()); diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index 4fdf8f9ec2695..1642d10d427b5 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -1803,6 +1803,7 @@ void ConvertCIRToLLVMPass::runOnOperation() { CIRToLLVMVecExtractOpLowering, CIRToLLVMVecInsertOpLowering, CIRToLLVMVecCmpOpLowering, + CIRToLLVMVecSplatOpLowering, CIRToLLVMVecShuffleOpLowering, CIRToLLVMVecShuffleDynamicOpLowering, CIRToLLVMVecTernaryOpLowering @@ -1956,6 +1957,56 @@ mlir::LogicalResult CIRToLLVMVecCmpOpLowering::matchAndRewrite( return mlir::success(); } +mlir::LogicalResult 
CIRToLLVMVecSplatOpLowering::matchAndRewrite( + cir::VecSplatOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + // Vector splat can be implemented with an `insertelement` and a + // `shufflevector`, which is better than an `insertelement` for each + // element in the vector. Start with an undef vector. Insert the value into + // the first element. Then use a `shufflevector` with a mask of all 0 to + // fill out the entire vector with that value. + cir::VectorType vecTy = op.getType(); + mlir::Type llvmTy = typeConverter->convertType(vecTy); + mlir::Location loc = op.getLoc(); + mlir::Value poison = rewriter.create(loc, llvmTy); + + mlir::Value elementValue = adaptor.getValue(); + if (mlir::isa(elementValue.getDefiningOp())) { + // If the splat value is poison, then we can just use poison value + // for the entire vector. + rewriter.replaceOp(op, poison); + return mlir::success(); + } + + if (auto constValue = + dyn_cast(elementValue.getDefiningOp())) { + if (auto intAttr = dyn_cast(constValue.getValue())) { + mlir::DenseIntElementsAttr denseVec = mlir::DenseIntElementsAttr::get( + mlir::cast(llvmTy), intAttr.getValue()); + rewriter.replaceOpWithNewOp( + op, denseVec.getType(), denseVec); + return mlir::success(); + } + + if (auto fpAttr = dyn_cast(constValue.getValue())) { + mlir::DenseFPElementsAttr denseVec = mlir::DenseFPElementsAttr::get( + mlir::cast(llvmTy), fpAttr.getValue()); + rewriter.replaceOpWithNewOp( + op, denseVec.getType(), denseVec); + return mlir::success(); + } + } + + mlir::Value indexValue = + rewriter.create(loc, rewriter.getI64Type(), 0); + mlir::Value oneElement = rewriter.create( + loc, poison, elementValue, indexValue); + SmallVector zeroValues(vecTy.getSize(), 0); + rewriter.replaceOpWithNewOp(op, oneElement, + poison, zeroValues); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMVecShuffleOpLowering::matchAndRewrite( cir::VecShuffleOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter 
&rewriter) const { diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h index 22d8a1e7c22e0..2eda568c84bdb 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.h @@ -367,6 +367,16 @@ class CIRToLLVMVecCmpOpLowering mlir::ConversionPatternRewriter &) const override; }; +class CIRToLLVMVecSplatOpLowering + : public mlir::OpConversionPattern { +public: + using mlir::OpConversionPattern::OpConversionPattern; + + mlir::LogicalResult + matchAndRewrite(cir::VecSplatOp op, OpAdaptor, + mlir::ConversionPatternRewriter &) const override; +}; + class CIRToLLVMVecShuffleOpLowering : public mlir::OpConversionPattern { public: diff --git a/clang/test/CIR/CodeGen/vector-ext.cpp b/clang/test/CIR/CodeGen/vector-ext.cpp index e1814f216f6b9..965c44c9461a8 100644 --- a/clang/test/CIR/CodeGen/vector-ext.cpp +++ b/clang/test/CIR/CodeGen/vector-ext.cpp @@ -990,6 +990,7 @@ void foo14() { // OGCG: %[[TMP_B:.*]] = load <4 x float>, ptr %[[VEC_B]], align 16 // OGCG: %[[GE:.*]] = fcmp oge <4 x float> %[[TMP_A]], %[[TMP_B]] // OGCG: %[[RES:.*]] = sext <4 x i1> %[[GE]] to <4 x i32> +// OGCG: store <4 x i32> %[[RES]], ptr {{.*}}, align 16 void foo15() { vi4 a; @@ -1092,6 +1093,69 @@ void foo17() { // OGCG: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16 // OGCG: %[[RES:.*]]= fptoui <2 x double> %[[TMP]] to <2 x i16> +void foo18() { + vi4 a = {1, 2, 3, 4}; + vi4 shl = a << 3; + + uvi4 b = {1u, 2u, 3u, 4u}; + uvi4 shr = b >> 3u; +} + +// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] +// CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["shl", init] +// CIR: %[[VEC_B:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr>, ["b", init] +// CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr>, ["shr", init] +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: %[[CONST_2:.*]] = cir.const 
#cir.int<2> : !s32i +// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i +// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i +// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : +// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i +// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i> +// CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i +// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i +// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i +// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : +// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i> +// CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr> +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr>, !cir.vector<4 x !u32i> +// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i +// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !u32i, !cir.vector<4 x !u32i> +// CIR: %[[SHR:.*]] = cir.shift(right, %[[TMP_B]] : !cir.vector<4 x !u32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !u32i>) -> !cir.vector<4 x !u32i> +// CIR: cir.store{{.*}} %[[SHR]], %[[SHR_RES]] : !cir.vector<4 x !u32i>, !cir.ptr> + +// LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[SHL_RES:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: 
%[[SHR_RES:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: store <4 x i32> , ptr %[[VEC_A]], align 16 +// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 +// LLVM: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3) +// LLVM: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16 +// LLVM: store <4 x i32> , ptr %[[VEC_B]], align 16 +// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 +// LLVM: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3) +// LLVM: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16 + +// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[SHL_RES:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[SHR_RES:.*]] = alloca <4 x i32>, align 16 +// OGCG: store <4 x i32> , ptr %[[VEC_A]], align 16 +// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 +// OGCG: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3) +// OGCG: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16 +// OGCG: store <4 x i32> , ptr %[[VEC_B]], align 16 +// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 +// OGCG: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3) +// OGCG: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16 + void foo19() { vi4 a; vi4 b; diff --git a/clang/test/CIR/CodeGen/vector.cpp b/clang/test/CIR/CodeGen/vector.cpp index 4f116faa7a1ac..23e91724dc0f3 100644 --- a/clang/test/CIR/CodeGen/vector.cpp +++ b/clang/test/CIR/CodeGen/vector.cpp @@ -1071,6 +1071,69 @@ void foo17() { // OGCG: %[[TMP:.*]] = load <2 x double>, ptr %[[VEC_A]], align 16 // OGCG: %[[RES:.*]]= fptoui <2 x double> %[[TMP]] to <2 x i16> +void foo18() { + vi4 a = {1, 2, 3, 4}; + vi4 shl = a << 3; + + uvi4 b = {1u, 2u, 3u, 4u}; + uvi4 shr = b >> 3u; +} + +// CIR: %[[VEC_A:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] +// CIR: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["shl", init] +// CIR: %[[VEC_B:.*]] = cir.alloca 
!cir.vector<4 x !u32i>, !cir.ptr>, ["b", init] +// CIR: %[[SHR_RES:.*]] = cir.alloca !cir.vector<4 x !u32i>, !cir.ptr>, ["shr", init] +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i +// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i +// CIR: %[[VEC_A_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : +// CIR-SAME: !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[VEC_A_VAL]], %[[VEC_A]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[VEC_A]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !s32i +// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !s32i, !cir.vector<4 x !s32i> +// CIR: %[[SHL:.*]] = cir.shift(left, %[[TMP_A]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i> +// CIR: cir.store{{.*}} %[[SHL]], %[[SHL_RES]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CIR: %[[CONST_1:.*]] = cir.const #cir.int<1> : !u32i +// CIR: %[[CONST_2:.*]] = cir.const #cir.int<2> : !u32i +// CIR: %[[CONST_3:.*]] = cir.const #cir.int<3> : !u32i +// CIR: %[[CONST_4:.*]] = cir.const #cir.int<4> : !u32i +// CIR: %[[VEC_B_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : +// CIR-SAME: !u32i, !u32i, !u32i, !u32i) : !cir.vector<4 x !u32i> +// CIR: cir.store{{.*}} %[[VEC_B_VAL]], %[[VEC_B]] : !cir.vector<4 x !u32i>, !cir.ptr> +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[VEC_B]] : !cir.ptr>, !cir.vector<4 x !u32i> +// CIR: %[[SH_AMOUNT:.*]] = cir.const #cir.int<3> : !u32i +// CIR: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SH_AMOUNT]] : !u32i, !cir.vector<4 x !u32i> +// CIR: %[[SHR:.*]] = cir.shift(right, %[[TMP_B]] : !cir.vector<4 x !u32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !u32i>) -> !cir.vector<4 x !u32i> +// CIR: cir.store{{.*}} %[[SHR]], %[[SHR_RES]] : 
!cir.vector<4 x !u32i>, !cir.ptr> + +// LLVM: %[[VEC_A:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[SHL_RES:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[VEC_B:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: %[[SHR_RES:.*]] = alloca <4 x i32>, i64 1, align 16 +// LLVM: store <4 x i32> , ptr %[[VEC_A]], align 16 +// LLVM: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 +// LLVM: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3) +// LLVM: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16 +// LLVM: store <4 x i32> , ptr %[[VEC_B]], align 16 +// LLVM: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 +// LLVM: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3) +// LLVM: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16 + +// OGCG: %[[VEC_A:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[SHL_RES:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[VEC_B:.*]] = alloca <4 x i32>, align 16 +// OGCG: %[[SHR_RES:.*]] = alloca <4 x i32>, align 16 +// OGCG: store <4 x i32> , ptr %[[VEC_A]], align 16 +// OGCG: %[[TMP_A:.*]] = load <4 x i32>, ptr %[[VEC_A]], align 16 +// OGCG: %[[SHL:.*]] = shl <4 x i32> %[[TMP_A]], splat (i32 3) +// OGCG: store <4 x i32> %[[SHL]], ptr %[[SHL_RES]], align 16 +// OGCG: store <4 x i32> , ptr %[[VEC_B]], align 16 +// OGCG: %[[TMP_B:.*]] = load <4 x i32>, ptr %[[VEC_B]], align 16 +// OGCG: %[[SHR:.*]] = lshr <4 x i32> %[[TMP_B]], splat (i32 3) +// OGCG: store <4 x i32> %[[SHR]], ptr %[[SHR_RES]], align 16 + void foo19() { vi4 a; vi4 b; diff --git a/clang/test/CIR/IR/vector.cir b/clang/test/CIR/IR/vector.cir index a455acf92ab6f..f23f5de9692de 100644 --- a/clang/test/CIR/IR/vector.cir +++ b/clang/test/CIR/IR/vector.cir @@ -187,4 +187,37 @@ cir.func @vector_shuffle_dynamic_test() { // CHECK: cir.return // CHECK: } +cir.func @vector_splat_test() { + %0 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] + %1 = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["shl", init] + %2 = cir.const #cir.int<1> : 
!s32i + %3 = cir.const #cir.int<2> : !s32i + %4 = cir.const #cir.int<3> : !s32i + %5 = cir.const #cir.int<4> : !s32i + %6 = cir.vec.create(%2, %3, %4, %5 : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i> + cir.store %6, %0 : !cir.vector<4 x !s32i>, !cir.ptr> + %7 = cir.load %0 : !cir.ptr>, !cir.vector<4 x !s32i> + %8 = cir.const #cir.int<3> : !s32i + %9 = cir.vec.splat %8 : !s32i, !cir.vector<4 x !s32i> + %10 = cir.shift(left, %7 : !cir.vector<4 x !s32i>, %9 : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i> + cir.store %10, %1 : !cir.vector<4 x !s32i>, !cir.ptr> + cir.return +} + +// CHECK: cir.func @vector_splat_test() { +// CHECK-NEXT: %[[VEC:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["a", init] +// CHECK-NEXT: %[[SHL_RES:.*]] = cir.alloca !cir.vector<4 x !s32i>, !cir.ptr>, ["shl", init] +// CHECK-NEXT: %[[CONST_1:.*]] = cir.const #cir.int<1> : !s32i +// CHECK-NEXT: %[[CONST_2:.*]] = cir.const #cir.int<2> : !s32i +// CHECK-NEXT: %[[CONST_3:.*]] = cir.const #cir.int<3> : !s32i +// CHECK-NEXT: %[[CONST_4:.*]] = cir.const #cir.int<4> : !s32i +// CHECK-NEXT: %[[VEC_VAL:.*]] = cir.vec.create(%[[CONST_1]], %[[CONST_2]], %[[CONST_3]], %[[CONST_4]] : !s32i, !s32i, !s32i, !s32i) : !cir.vector<4 x !s32i> +// CHECK-NEXT: cir.store %[[VEC_VAL]], %[[VEC]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CHECK-NEXT: %[[TMP:.*]] = cir.load %[[VEC]] : !cir.ptr>, !cir.vector<4 x !s32i> +// CHECK-NEXT: %[[SPLAT_VAL:.*]] = cir.const #cir.int<3> : !s32i +// CHECK-NEXT: %[[SPLAT_VEC:.*]] = cir.vec.splat %[[SPLAT_VAL]] : !s32i, !cir.vector<4 x !s32i> +// CHECK-NEXT: %[[SHL:.*]] = cir.shift(left, %[[TMP]] : !cir.vector<4 x !s32i>, %[[SPLAT_VEC]] : !cir.vector<4 x !s32i>) -> !cir.vector<4 x !s32i> +// CHECK-NEXT: cir.store %[[SHL]], %[[SHL_RES:.*]] : !cir.vector<4 x !s32i>, !cir.ptr> +// CHECK-NEXT: cir.return + } From 621a7d0f66f3da27e687dd7dd832450334ee81da Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Wed, 11 Jun 2025 19:02:47 +0200 Subject: [PATCH 112/851] [flang] silence 
bogus error with BIND(C) variable in hermetic module (#143737) The global name semantic check was firing in a bogus way when BIND(C) variables are in a hermetic module. Do not raise the error if one of the symbols with the conflicting global name is a "hermetic variant" of the other. --- flang/lib/Semantics/check-declarations.cpp | 10 +++++++++ flang/test/Semantics/modfile76.F90 | 24 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 flang/test/Semantics/modfile76.F90 diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 46a5b970fdf0c..f9d64485f1407 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -2958,6 +2958,14 @@ static std::optional DefinesGlobalName(const Symbol &symbol) { return std::nullopt; } +static bool IsSameSymbolFromHermeticModule( + const Symbol &symbol, const Symbol &other) { + return symbol.name() == other.name() && symbol.owner().IsModule() && + other.owner().IsModule() && symbol.owner() != other.owner() && + symbol.owner().GetName() && + symbol.owner().GetName() == other.owner().GetName(); +} + // 19.2 p2 void CheckHelper::CheckGlobalName(const Symbol &symbol) { if (auto global{DefinesGlobalName(symbol)}) { @@ -2975,6 +2983,8 @@ void CheckHelper::CheckGlobalName(const Symbol &symbol) { (!IsExternalProcedureDefinition(symbol) || !IsExternalProcedureDefinition(other))) { // both are procedures/BLOCK DATA, not both definitions + } else if (IsSameSymbolFromHermeticModule(symbol, other)) { + // Both symbols are the same thing.
} else if (symbol.has()) { Warn(common::LanguageFeature::BenignNameClash, symbol.name(), "Module '%s' conflicts with a global name"_port_en_US, diff --git a/flang/test/Semantics/modfile76.F90 b/flang/test/Semantics/modfile76.F90 new file mode 100644 index 0000000000000..50ee9a088e119 --- /dev/null +++ b/flang/test/Semantics/modfile76.F90 @@ -0,0 +1,24 @@ +!RUN: %flang_fc1 -fsyntax-only -fhermetic-module-files -DSTEP=1 %s +!RUN: %flang_fc1 -fsyntax-only %s + +! Tests that a BIND(C) variable in a module A captured in a hermetic module +! file USE'd in a module B is not creating bogus complaints about BIND(C) name +! conflict when both module A and B are later accessed. + +#if STEP == 1 +module modfile75a + integer, bind(c) :: x +end + +module modfile75b + use modfile75a ! capture hermetically +end + +#else +subroutine test + use modfile75a + use modfile75b + implicit none + print *, x +end subroutine +#endif From 7414d88b5f8af1bdf8da6bf2493b485ba5d079f2 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 11 Jun 2025 18:13:56 +0100 Subject: [PATCH 113/851] Squelch an unused-function warning After removing some debug-intrinsic creation code, this function is now unused (and un-necessary) --- llvm/lib/IR/DIBuilder.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index 1484c549dd580..c56dd7a1d3820 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -1069,10 +1069,6 @@ static Value *getDbgIntrinsicValueImpl(LLVMContext &VMContext, Value *V) { return MetadataAsValue::get(VMContext, ValueAsMetadata::get(V)); } -static Function *getDeclareIntrin(Module &M) { - return Intrinsic::getOrInsertDeclaration(&M, Intrinsic::dbg_declare); -} - DbgInstPtr DIBuilder::insertDbgValueIntrinsic(llvm::Value *Val, DILocalVariable *VarInfo, DIExpression *Expr, From 3e24dadee0d7ecc5f95fe0760afb7abdeb9a2dc5 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Wed, 11 Jun 2025 10:24:19 -0700 Subject: [PATCH 
114/851] [Clang][Tooling][NFC] Use move to avoid copies of large objects (#143603) Static analysis flagged these cases in which we can use std::move and avoid copies of large objects. --- clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp index 44a270d5f7b35..b1495163ccc24 100644 --- a/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp +++ b/clang/lib/Tooling/DependencyScanning/ModuleDepCollector.cpp @@ -657,7 +657,7 @@ void ModuleDepCollectorPP::moduleImport(SourceLocation ImportLoc, P1689ModuleInfo RequiredModule; RequiredModule.ModuleName = Path[0].getIdentifierInfo()->getName().str(); RequiredModule.Type = P1689ModuleInfo::ModuleType::NamedCXXModule; - MDC.RequiredStdCXXModules.push_back(RequiredModule); + MDC.RequiredStdCXXModules.push_back(std::move(RequiredModule)); return; } @@ -920,7 +920,7 @@ void ModuleDepCollectorPP::addAllSubmoduleDeps( void ModuleDepCollectorPP::addOneModuleDep(const Module *M, const ModuleID ID, ModuleDeps &MD) { - MD.ClangModuleDeps.push_back(ID); + MD.ClangModuleDeps.push_back(std::move(ID)); if (MD.IsInStableDirectories) MD.IsInStableDirectories = MDC.ModularDeps[M]->IsInStableDirectories; } From 66f533e7e34d6f6d0e293a67dd54be9e4c240ddd Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 10:39:02 -0700 Subject: [PATCH 115/851] [IR] Fix warnings (#143752) This patch fixes: llvm/lib/IR/DIBuilder.cpp:1072:18: error: unused function 'getDeclareIntrin' [-Werror,-Wunused-function] llvm/include/llvm/IR/DIBuilder.h:51:15: error: private field 'DeclareFn' is not used [-Werror,-Wunused-private-field] llvm/include/llvm/IR/DIBuilder.h:52:15: error: private field 'ValueFn' is not used [-Werror,-Wunused-private-field] llvm/include/llvm/IR/DIBuilder.h:53:15: error: private field 'LabelFn' is not used
[-Werror,-Wunused-private-field] llvm/include/llvm/IR/DIBuilder.h:54:15: error: private field 'AssignFn' is not used [-Werror,-Wunused-private-field] --- llvm/include/llvm/IR/DIBuilder.h | 6 +----- llvm/lib/IR/DIBuilder.cpp | 3 +-- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/IR/DIBuilder.h b/llvm/include/llvm/IR/DIBuilder.h index ebfe41dd59afb..43fca571ee6d5 100644 --- a/llvm/include/llvm/IR/DIBuilder.h +++ b/llvm/include/llvm/IR/DIBuilder.h @@ -47,11 +47,7 @@ namespace llvm { Module &M; LLVMContext &VMContext; - DICompileUnit *CUNode; ///< The one compile unit created by this DIBuiler. - Function *DeclareFn; ///< llvm.dbg.declare - Function *ValueFn; ///< llvm.dbg.value - Function *LabelFn; ///< llvm.dbg.label - Function *AssignFn; ///< llvm.dbg.assign + DICompileUnit *CUNode; ///< The one compile unit created by this DIBuiler. SmallVector AllEnumTypes; /// Track the RetainTypes, since they can be updated later on. diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp index c56dd7a1d3820..fd8c2d7bb5cc3 100644 --- a/llvm/lib/IR/DIBuilder.cpp +++ b/llvm/lib/IR/DIBuilder.cpp @@ -25,8 +25,7 @@ using namespace llvm; using namespace llvm::dwarf; DIBuilder::DIBuilder(Module &m, bool AllowUnresolvedNodes, DICompileUnit *CU) - : M(m), VMContext(M.getContext()), CUNode(CU), DeclareFn(nullptr), - ValueFn(nullptr), LabelFn(nullptr), AssignFn(nullptr), + : M(m), VMContext(M.getContext()), CUNode(CU), AllowUnresolvedNodes(AllowUnresolvedNodes) { if (CUNode) { if (const auto &ETs = CUNode->getEnumTypes()) From c2f0af514beb7618660cf8d145fa9e49fb78869c Mon Sep 17 00:00:00 2001 From: Alexander Richardson Date: Wed, 11 Jun 2025 10:47:17 -0700 Subject: [PATCH 116/851] [GISelValueTracking] Add test case for G_PTRTOINT While we can only reason about the index/address, the G_PTRTOINT operation returns all representation bits, so we can't assume the remaining ones are all zeroes.
This behaviour was clarified as part of the discussion in https://discourse.llvm.org/t/clarifiying-the-semantics-of-ptrtoint/83987/54. The LangRef semantics of ptrtoint being a full representation bitcast were documented in https://github.com/llvm/llvm-project/pull/139349. Prior to 77c8d214131e951e3d3a07b45a7436f54988d6f3 we were incorrectly assuming known zeroes beyond the index size even if the input was completely unknown. This commit adds a test case for G_PTRTOINT which was omitted from that change. See https://github.com/llvm/llvm-project/issues/139598 Reviewed By: arsenm Pull Request: https://github.com/llvm/llvm-project/pull/139608 --- .../AMDGPU/GlobalISel/knownbits-ptrtoint.mir | 110 ++++++++++++++++++ 1 file changed, 110 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir new file mode 100644 index 0000000000000..4073568fd4210 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/knownbits-ptrtoint.mir @@ -0,0 +1,110 @@ +# NOTE: Assertions have been autogenerated by utils/update_givaluetracking_test_checks.py UTC_ARGS: --version 5 +# RUN: llc -mtriple=amdgcn -passes="print" %s -filetype=null 2>&1 | FileCheck %s +## Check that we don't incorrectly assume known zeroes for and extend of a truncated ptrtoint +## Test case for https://github.com/llvm/llvm-project/issues/139598 +--- +## We should see 128 unknown bits. +name: PtrToInt +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToInt + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? 
SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s128) = G_PTRTOINT %4(p8) +... +--- +## We should see 128 high zeroes followed by 128 unknown bits for extending ptrtoint. +name: PtrToIntExt +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToIntExt + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:128 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s256) = G_PTRTOINT %4(p8) +... +--- +## We should see 48 unknown bits for truncating ptrtoint. +name: PtrToIntTrunc +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToIntTrunc + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? 
SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????? SignBits:1 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s48) = G_PTRTOINT %4(p8) +... +--- +## This is the test for issue 139598: Truncating and then extending the +## G_PTRTOINT result was filling all bits above the index bitwidth with known +## zeroes even though the incoming value is completely unknown and G_PTRTOINT. +## is lowered to a bitwise copy. +## We should see all zero high bits with 48 unknown bits. +name: PtrToIntTruncExplicitExt +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToIntTruncExplicitExt + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %6:_ KnownBits:???????????????????????????????????????????????? 
SignBits:1 + ; CHECK-NEXT: %7:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????? SignBits:208 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s128) = G_PTRTOINT %4(p8) + %6:_(s48) = G_TRUNC %5(s128) + %7:_(s256) = G_ZEXT %6(s48) +... +--- +## Same test again but this time have the G_PTRTOINT do the truncation. +## We should see all zero high bits with 48 unknown bits. +name: PtrToIntTruncImplicitExt +body: | + bb.0: + ; CHECK-LABEL: name: @PtrToIntTruncImplicitExt + ; CHECK-NEXT: %0:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %1:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %2:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %3:_ KnownBits:???????????????????????????????? SignBits:1 + ; CHECK-NEXT: %4:_ KnownBits:???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %5:_ KnownBits:???????????????????????????????????????????????? SignBits:1 + ; CHECK-NEXT: %6:_ KnownBits:0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000???????????????????????????????????????????????? SignBits:208 + %0:_(s32) = COPY $vgpr0 + %1:_(s32) = COPY $vgpr1 + %2:_(s32) = COPY $vgpr2 + %3:_(s32) = COPY $vgpr3 + %4:_(p8) = G_MERGE_VALUES %0(s32), %1(s32), %2(s32), %3(s32) + %5:_(s48) = G_PTRTOINT %4(p8) + %6:_(s256) = G_ZEXT %5(s48) +... 
From bbe59e19b60b0efa8cc200fb3260fe572e188b26 Mon Sep 17 00:00:00 2001 From: Kewen12 Date: Wed, 11 Jun 2025 11:12:54 -0700 Subject: [PATCH 117/851] [OpenMP][Offload] Update the Logic for Configuring Auto Zero-Copy (#143638) Summary: Currently the Auto Zero-Copy is enabled by checking every initialized device to ensure that no dGPU is attached to an APU. However, an APU is designed to comprise a homogeneous set of GPUs, therefore, it should be sufficient to check any device for configuring Auto Zero-Copy. In this PR, it checks the first initialized device in the list. The changes in this PR are to clearly reflect the design and logic of enabling the feature for further improving the readability. --- offload/libomptarget/PluginManager.cpp | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/offload/libomptarget/PluginManager.cpp b/offload/libomptarget/PluginManager.cpp index 93589960a426d..c4d99dfa9f10c 100644 --- a/offload/libomptarget/PluginManager.cpp +++ b/offload/libomptarget/PluginManager.cpp @@ -286,16 +286,16 @@ void PluginManager::registerLib(__tgt_bin_desc *Desc) { } PM->RTLsMtx.unlock(); - bool UseAutoZeroCopy = Plugins.size() > 0; + bool UseAutoZeroCopy = false; auto ExclusiveDevicesAccessor = getExclusiveDevicesAccessor(); - for (const auto &Device : *ExclusiveDevicesAccessor) - UseAutoZeroCopy &= Device->useAutoZeroCopy(); + // APUs are homogeneous set of GPUs. Check the first device for + // configuring Auto Zero-Copy. + if (ExclusiveDevicesAccessor->size() > 0) { + auto &Device = *(*ExclusiveDevicesAccessor)[0]; + UseAutoZeroCopy = Device.useAutoZeroCopy(); + } - // Auto Zero-Copy can only be currently triggered when the system is an - // homogeneous APU architecture without attached discrete GPUs. - // If all devices suggest to use it, change requirement flags to trigger - // zero-copy behavior when mapping memory.
if (UseAutoZeroCopy) addRequirements(OMPX_REQ_AUTO_ZERO_COPY); From fad1972d74aead159a5e91b068cbf736e83836b5 Mon Sep 17 00:00:00 2001 From: VISHAKH PRAKASH Date: Wed, 11 Jun 2025 23:43:01 +0530 Subject: [PATCH 118/851] [SPIRV] FIX print the symbolic operand for opcode for the operation OpSpecConstantOp (#135756) The current implementation prints the opcode operand of OpSpecConstantOp as an immediate, but spirv-tools requires the name of the operation without the "Op" prefix. That is, if the opcode is OpBitcast, the instruction must be `%1 = OpSpecConstantOp %6 Bitcast %17` instead of `%1 = OpSpecConstantOp %6 124 %17`. [Refer to this commit for more info](https://github.com/KhronosGroup/SPIRV-Tools/commit/0f166be68d4b6624a10d6bf312679505d391ec22) --------- Co-authored-by: Dmitry Sidorov Co-authored-by: Ebin-McW --- .../SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp | 3 +- .../Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h | 5 ++ llvm/lib/Target/SPIRV/SPIRVInstrInfo.td | 2 +- .../lib/Target/SPIRV/SPIRVSymbolicOperands.td | 90 +++++++++++++++++++ llvm/test/CodeGen/SPIRV/const-nested-vecs.ll | 4 +- .../fun-ptr-addrcast.ll | 2 +- .../opencl/basic/progvar_prog_scope_init.ll | 2 +- .../CodeGen/SPIRV/opt-gepoperator-of-gvar.ll | 2 +- .../pointers/PtrCast-in-OpSpecConstantOp.ll | 12 +-- .../CodeGen/SPIRV/pointers/global-ptrtoint.ll | 4 +- .../pointers/irtrans-added-int-const-32-64.ll | 2 +- 11 files changed, 112 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp index 342456757409a..0ed97f5b41c51 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.cpp @@ -68,7 +68,8 @@ getSymbolicOperandMnemonic(SPIRV::OperandCategory::OperandCategory Category, Category != SPIRV::OperandCategory::FunctionControlOperand && Category != SPIRV::OperandCategory::MemorySemanticsOperand && Category != SPIRV::OperandCategory::MemoryOperandOperand && -
Category != SPIRV::OperandCategory::KernelProfilingInfoOperand) + Category != SPIRV::OperandCategory::KernelProfilingInfoOperand && + Category != SPIRV::OperandCategory::SpecConstantOpOperandsOperand) return "UNKNOWN"; // Value that encodes many enum values (one bit per enum value). std::string Name; diff --git a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h index 083c7f8460bf2..b8c467fef8e8e 100644 --- a/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h +++ b/llvm/lib/Target/SPIRV/MCTargetDesc/SPIRVBaseInfo.h @@ -222,6 +222,11 @@ namespace CooperativeMatrixOperands { #include "SPIRVGenTables.inc" } // namespace CooperativeMatrixOperands +namespace SpecConstantOpOperands { +#define GET_SpecConstantOpOperands_DECL +#include "SPIRVGenTables.inc" +} // namespace SpecConstantOpOperands + struct ExtendedBuiltin { StringRef Name; InstructionSet::InstructionSet Set; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td index 338f6809a3e46..049ba0275f223 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td +++ b/llvm/lib/Target/SPIRV/SPIRVInstrInfo.td @@ -245,7 +245,7 @@ def OpSpecConstantComposite: Op<51, (outs ID:$res), (ins TYPE:$type, variable_op "$res = OpSpecConstantComposite $type">; def OpSpecConstantCompositeContinuedINTEL: Op<6092, (outs), (ins variable_ops), "OpSpecConstantCompositeContinuedINTEL">; -def OpSpecConstantOp: Op<52, (outs ID:$res), (ins TYPE:$t, i32imm:$c, ID:$o, variable_ops), +def OpSpecConstantOp: Op<52, (outs ID:$res), (ins TYPE:$t, SpecConstantOpOperands:$c, ID:$o, variable_ops), "$res = OpSpecConstantOp $t $c $o">; // 3.42.8 Memory Instructions diff --git a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td index ca8a9a9997a8b..f1aae42ea2be0 100644 --- a/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td +++ b/llvm/lib/Target/SPIRV/SPIRVSymbolicOperands.td @@ -172,6 +172,7 @@ def 
KernelProfilingInfoOperand : OperandCategory; def OpcodeOperand : OperandCategory; def CooperativeMatrixLayoutOperand : OperandCategory; def CooperativeMatrixOperandsOperand : OperandCategory; +def SpecConstantOpOperandsOperand : OperandCategory; def MatrixMultiplyAccumulateOperandsOperand : OperandCategory; //===----------------------------------------------------------------------===// @@ -1755,6 +1756,95 @@ defm MatrixAAndBBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x40, defm MatrixCBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x80, [SPV_INTEL_joint_matrix], [CooperativeMatrixBFloat16ComponentTypeINTEL]>; defm MatrixResultBFloat16ComponentsINTEL : CooperativeMatrixOperandsOperand<0x100, [SPV_INTEL_joint_matrix], [CooperativeMatrixBFloat16ComponentTypeINTEL]>; +//===----------------------------------------------------------------------===// +// Multiclass used to define SpecConstant Operands enum values and at the +// same time SymbolicOperand. +//===----------------------------------------------------------------------===// + +def SpecConstantOpOperands : GenericEnum, Operand { + let FilterClass = "SpecConstantOpOperands"; + let NameField = "Name"; + let ValueField = "Value"; + let PrintMethod = !strconcat("printSymbolicOperand"); +} + +class SpecConstantOpOperands value> { + string Name = name; + bits<32> Value = value; +} + +multiclass SpecConstantOpOperandsOperand value, list reqExtensions, list reqCapabilities> { + def : SpecConstantOpOperands; + defm : SymbolicOperandWithRequirements; +} + +// Conversion +defm SConvert : SpecConstantOpOperandsOperand<114, [], []>; +defm FConvert : SpecConstantOpOperandsOperand<115, [], []>; +defm ConvertFToS : SpecConstantOpOperandsOperand<110, [], [Kernel]>; +defm ConvertSToF : SpecConstantOpOperandsOperand<111, [], [Kernel]>; +defm ConvertFToU : SpecConstantOpOperandsOperand<109, [], [Kernel]>; +defm ConvertUToF : SpecConstantOpOperandsOperand<112, [], [Kernel]>; +defm UConvert : 
SpecConstantOpOperandsOperand<113, [], [Kernel]>; +defm ConvertPtrToU : SpecConstantOpOperandsOperand<117, [], [Kernel]>; +defm ConvertUToPtr : SpecConstantOpOperandsOperand<120, [], [Kernel]>; +defm GenericCastToPtr : SpecConstantOpOperandsOperand<122, [], [Kernel]>; +defm PtrCastToGeneric : SpecConstantOpOperandsOperand<121, [], [Kernel]>; +defm Bitcast : SpecConstantOpOperandsOperand<124, [], []>; +defm QuantizeToF16 : SpecConstantOpOperandsOperand<116, [], [Shader]>; +// Arithmetic +defm SNegate : SpecConstantOpOperandsOperand<126, [], []>; +defm Not : SpecConstantOpOperandsOperand<200, [], []>; +defm IAdd : SpecConstantOpOperandsOperand<128, [], []>; +defm ISub : SpecConstantOpOperandsOperand<130, [], []>; +defm IMul : SpecConstantOpOperandsOperand<132, [], []>; +defm UDiv : SpecConstantOpOperandsOperand<134, [], []>; +defm SDiv : SpecConstantOpOperandsOperand<135, [], []>; +defm UMod : SpecConstantOpOperandsOperand<137, [], []>; +defm SRem : SpecConstantOpOperandsOperand<138, [], []>; +defm SMod : SpecConstantOpOperandsOperand<139, [], []>; +defm ShiftRightLogical : SpecConstantOpOperandsOperand<194, [], []>; +defm ShiftRightArithmetic : SpecConstantOpOperandsOperand<195, [], []>; +defm ShiftLeftLogical : SpecConstantOpOperandsOperand<196, [], []>; +defm BitwiseOr : SpecConstantOpOperandsOperand<197, [], []>; +defm BitwiseAnd : SpecConstantOpOperandsOperand<199, [], []>; +defm BitwiseXor : SpecConstantOpOperandsOperand<198, [], []>; +defm FNegate : SpecConstantOpOperandsOperand<127, [], [Kernel]>; +defm FAdd : SpecConstantOpOperandsOperand<129, [], [Kernel]>; +defm FSub : SpecConstantOpOperandsOperand<131, [], [Kernel]>; +defm FMul : SpecConstantOpOperandsOperand<133, [], [Kernel]>; +defm FDiv : SpecConstantOpOperandsOperand<136, [], [Kernel]>; +defm FRem : SpecConstantOpOperandsOperand<140, [], [Kernel]>; +defm FMod : SpecConstantOpOperandsOperand<141, [], [Kernel]>; +// Composite; +defm VectorShuffle : SpecConstantOpOperandsOperand<79, [], []>; +defm 
CompositeExtract : SpecConstantOpOperandsOperand<81, [], []>; +defm CompositeInsert : SpecConstantOpOperandsOperand<82, [], []>; +// Logical; +defm LogicalOr : SpecConstantOpOperandsOperand<166, [], []>; +defm LogicalAnd : SpecConstantOpOperandsOperand<167, [], []>; +defm LogicalNot : SpecConstantOpOperandsOperand<168, [], []>; +defm LogicalEqual : SpecConstantOpOperandsOperand<164, [], []>; +defm LogicalNotEqual : SpecConstantOpOperandsOperand<165, [], []>; +defm Select : SpecConstantOpOperandsOperand<169, [], []>; +// Comparison; +defm IEqual : SpecConstantOpOperandsOperand<170, [], []>; +defm INotEqual : SpecConstantOpOperandsOperand<171, [], []>; +defm ULessThan : SpecConstantOpOperandsOperand<176, [], []>; +defm SLessThan : SpecConstantOpOperandsOperand<177, [], []>; +defm UGreaterThan : SpecConstantOpOperandsOperand<172, [], []>; +defm SGreaterThan : SpecConstantOpOperandsOperand<173, [], []>; +defm ULessThanEqual : SpecConstantOpOperandsOperand<178, [], []>; +defm SLessThanEqual : SpecConstantOpOperandsOperand<179, [], []>; +defm UGreaterThanEqual : SpecConstantOpOperandsOperand<174, [], []>; +defm SGreaterThanEqual : SpecConstantOpOperandsOperand<175, [], []>; +// Memory +defm AccessChain : SpecConstantOpOperandsOperand<65, [], [Kernel]>; +defm InBoundsAccessChain : SpecConstantOpOperandsOperand<66, [], [Kernel]>; +defm PtrAccessChain : SpecConstantOpOperandsOperand<67, [], [Kernel]>; +defm InBoundsPtrAccessChain : SpecConstantOpOperandsOperand<70, [], [Kernel]>; +defm CooperativeMatrixLengthKHR : SpecConstantOpOperandsOperand<4460, [], []>; + //===----------------------------------------------------------------------===// // Multiclass used to define Matrix Multiply Accumulate Operands enum values and at the same time // SymbolicOperand entries with string mnemonics and capabilities. 
diff --git a/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll b/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll index 9234106e5fcd1..266b46e65f319 100644 --- a/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll +++ b/llvm/test/CodeGen/SPIRV/const-nested-vecs.ll @@ -25,8 +25,8 @@ ; CHECK-SPIRV-DAG: %[[#IntZero:]] = OpConstantNull %[[#IntTy]] ; CHECK-SPIRV-DAG: %[[#LongZero:]] = OpConstantNull %[[#LongTy]] ; CHECK-SPIRV64-DAG: %[[#ConstLong2:]] = OpConstant %[[#LongTy]] 2 -; CHECK-SPIRV64-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] 70 %[[#VarV2Char:]] %[[#IntZero]] %[[#ConstLong2]] -; CHECK-SPIRV32-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] 70 %[[#VarV2Char:]] %[[#IntZero]] %[[#Const2]] +; CHECK-SPIRV64-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] InBoundsPtrAccessChain %[[#VarV2Char:]] %[[#IntZero]] %[[#ConstLong2]] +; CHECK-SPIRV32-DAG: %[[#PvarInit:]] = OpSpecConstantOp %[[#PtrCharTy]] InBoundsPtrAccessChain %[[#VarV2Char:]] %[[#IntZero]] %[[#Const2]] ; CHECK-SPIRV-DAG: %[[#PtrPtrCharTy:]] = OpTypePointer CrossWorkgroup %[[#PtrCharTy]] ; CHECK-SPIRV-DAG: %[[#AVar]] = OpVariable %[[#PtrArr2V2CharTy]] CrossWorkgroup %[[#Arr2V2Char]] ; CHECK-SPIRV-DAG: %[[#PVar]] = OpVariable %[[#PtrPtrCharTy]] CrossWorkgroup %[[#PvarInit]] diff --git a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll index 8edecc1329d07..e5736b88b63a3 100644 --- a/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll +++ b/llvm/test/CodeGen/SPIRV/extensions/SPV_INTEL_function_pointers/fun-ptr-addrcast.ll @@ -5,7 +5,7 @@ ; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32-unknown-unknown %s -o - --spirv-ext=+SPV_INTEL_function_pointers | FileCheck %s ; TODO: %if spirv-tools %{ llc -O0 -mtriple=spirv64-unknown-unknown %s -o - -filetype=obj | spirv-val %} -; CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] 121 %[[#]] +; 
CHECK-COUNT-3: %[[#]] = OpSpecConstantOp %[[#]] PtrCastToGeneric %[[#]] ; CHECK-COUNT-3: OpPtrCastToGeneric @G1 = addrspace(1) constant { [3 x ptr addrspace(4)] } { [3 x ptr addrspace(4)] [ptr addrspace(4) null, ptr addrspace(4) addrspacecast (ptr @foo to ptr addrspace(4)), ptr addrspace(4) addrspacecast (ptr @bar to ptr addrspace(4))] } diff --git a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll index 9d759a1cf47d0..fbc83c7a1e045 100644 --- a/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll +++ b/llvm/test/CodeGen/SPIRV/opencl/basic/progvar_prog_scope_init.ll @@ -10,7 +10,7 @@ ; CHECK-DAG: %[[#pt2:]] = OpTypePointer CrossWorkgroup %[[#arr2]] ; CHECK-DAG: %[[#pt3:]] = OpTypePointer CrossWorkgroup %[[#pt1]] ; CHECK-DAG: %[[#a_var]] = OpVariable %[[#pt2]] CrossWorkgroup -; CHECK-DAG: %[[#const:]] = OpSpecConstantOp %[[#pt1]] 70 %[[#a_var]] +; CHECK-DAG: %[[#const:]] = OpSpecConstantOp %[[#pt1]] InBoundsPtrAccessChain %[[#a_var]] ; CHECK-DAG: %[[#p_var]] = OpVariable %[[#pt3]] CrossWorkgroup %[[#const]] @var = addrspace(1) global i8 0, align 1 @g_var = addrspace(1) global i8 1, align 1 diff --git a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll index 5f9229f5a5bd6..447dfa701b659 100644 --- a/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll +++ b/llvm/test/CodeGen/SPIRV/opt-gepoperator-of-gvar.ll @@ -14,7 +14,7 @@ ; CHECK-DAG: %[[#PtrStruct:]] = OpTypePointer CrossWorkgroup %[[#Struct]] ; CHECK-DAG: %[[#Var:]] = OpVariable %[[#PtrStruct]] CrossWorkgroup %[[#VarInit]] ; CHECK-DAG: %[[#Bytes:]] = OpVariable %[[#PtrChar]] CrossWorkgroup %[[#]] -; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] 70 %[[#Bytes]] %[[#C648]] +; CHECK-DAG: %[[#BytesGEP:]] = OpSpecConstantOp %[[#PtrChar]] InBoundsPtrAccessChain %[[#Bytes]] %[[#C648]] ; CHECK: OpFunction ; CHECK: %[[#]] = OpFunctionParameter %[[#]] diff --git 
a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll index 55d638f80cc55..ca7ca06fbdc8c 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/PtrCast-in-OpSpecConstantOp.ll @@ -23,20 +23,20 @@ ; CHECK-DAG: %[[WPtr:.*]] = OpTypePointer Workgroup %[[Int]] ; CHECK-DAG: %[[F]] = OpVariable %[[CWPtr]] CrossWorkgroup %[[#]] -; CHECK-DAG: %[[GenF:.*]] = OpSpecConstantOp %[[GenPtrChar]] 121 %[[F]] +; CHECK-DAG: %[[GenF:.*]] = OpSpecConstantOp %[[GenPtrChar]] PtrCastToGeneric %[[F]] ; CHECK-DAG: %[[B]] = OpVariable %[[CWPtr]] CrossWorkgroup %[[#]] -; CHECK-DAG: %[[GenB:.*]] = OpSpecConstantOp %[[GenPtrChar]] 121 %[[B]] +; CHECK-DAG: %[[GenB:.*]] = OpSpecConstantOp %[[GenPtrChar]] PtrCastToGeneric %[[B]] ; CHECK-DAG: %[[GenFB:.*]] = OpConstantComposite %[[Arr2]] %[[GenF]] %[[GenB]] ; CHECK-DAG: %[[GenBF:.*]] = OpConstantComposite %[[Arr2]] %[[GenB]] %[[GenF]] ; CHECK-DAG: %[[CG1:.*]] = OpConstantComposite %[[Struct2]] %[[GenFB]] ; CHECK-DAG: %[[CG2:.*]] = OpConstantComposite %[[Struct2]] %[[GenBF]] ; CHECK-DAG: %[[X]] = OpVariable %[[WPtr]] Workgroup %[[#]] -; CHECK-DAG: %[[GenX:.*]] = OpSpecConstantOp %[[GenPtr]] 121 %[[X]] -; CHECK-DAG: %[[CWX:.*]] = OpSpecConstantOp %[[CWPtrChar]] 122 %[[GenX]] +; CHECK-DAG: %[[GenX:.*]] = OpSpecConstantOp %[[GenPtr]] PtrCastToGeneric %[[X]] +; CHECK-DAG: %[[CWX:.*]] = OpSpecConstantOp %[[CWPtrChar]] GenericCastToPtr %[[GenX]] ; CHECK-DAG: %[[Y]] = OpVariable %[[WPtr]] Workgroup %[[#]] -; CHECK-DAG: %[[GenY:.*]] = OpSpecConstantOp %[[GenPtr]] 121 %[[Y]] -; CHECK-DAG: %[[CWY:.*]] = OpSpecConstantOp %[[CWPtrChar]] 122 %[[GenY]] +; CHECK-DAG: %[[GenY:.*]] = OpSpecConstantOp %[[GenPtr]] PtrCastToGeneric %[[Y]] +; CHECK-DAG: %[[CWY:.*]] = OpSpecConstantOp %[[CWPtrChar]] GenericCastToPtr %[[GenY]] ; CHECK-DAG: %[[CWXY:.*]] = OpConstantComposite %[[Arr1]] %[[CWX]] %[[CWY]] ; CHECK-DAG: %[[CWYX:.*]] = 
OpConstantComposite %[[Arr1]] %[[CWY]] %[[CWX]] ; CHECK-DAG: %[[CG3:.*]] = OpConstantComposite %[[Struct1]] %[[CWXY]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll index 16c20f9067e6e..0fd2f622dc840 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/global-ptrtoint.ll @@ -11,9 +11,9 @@ ; CHECK-DAG: %[[TyStruct:.*]] = OpTypeStruct %[[TyI64]] %[[TyI64]] ; CHECK-DAG: %[[Const128:.*]] = OpConstant %[[TyI64]] 128 ; CHECK-DAG: %[[GlobalValue]] = OpVariable -; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] 117 %[[GlobalValue]] +; CHECK-DAG: %[[PtrToInt:.*]] = OpSpecConstantOp %[[TyI64]] ConvertPtrToU %[[GlobalValue]] ; TODO: The following bitcast line looks unneeded and we may expect it to be removed in future -; CHECK-DAG: %[[UseGlobalValue:.*]] = OpSpecConstantOp %[[TyI64]] 124 %[[PtrToInt]] +; CHECK-DAG: %[[UseGlobalValue:.*]] = OpSpecConstantOp %[[TyI64]] Bitcast %[[PtrToInt]] ; CHECK-DAG: %[[ConstComposite:.*]] = OpConstantComposite %[[TyStruct]] %[[Const128]] %[[UseGlobalValue]] ; CHECK-DAG: %[[TyPtrStruct:.*]] = OpTypePointer CrossWorkgroup %[[TyStruct]] ; CHECK: OpVariable %[[TyPtrStruct]] CrossWorkgroup %[[ConstComposite]] diff --git a/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll b/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll index c2738229aa4d7..f5abcd38d0405 100644 --- a/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll +++ b/llvm/test/CodeGen/SPIRV/pointers/irtrans-added-int-const-32-64.ll @@ -12,7 +12,7 @@ ; CHECK-SPIRV64-DAG: %[[#IntTy:]] = OpTypeInt 64 0 ; CHECK-SPIRV32-DAG: %[[#IntTy:]] = OpTypeInt 32 0 ; CHECK-SPIRV-DAG: %[[#Const2:]] = OpConstant %[[#IntTy]] 2 -; CHECK-SPIRV-DAG: %[[#]] = OpSpecConstantOp %[[#]] 70 %[[#]] %[[#]] %[[#Const2]] +; CHECK-SPIRV-DAG: %[[#]] = OpSpecConstantOp %[[#]] InBoundsPtrAccessChain %[[#]] %[[#]] %[[#Const2]] ; CHECK-SPIRV: 
OpFunction @a_var = addrspace(1) global [2 x i8] [i8 1, i8 1] From 42c82fcc29c1c8e19b2265495a5d8f59fb5ea764 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 20:19:26 +0200 Subject: [PATCH 119/851] [libc++] Upgrade to GCC 15 (#138293) --- .github/workflows/libcxx-build-and-test.yaml | 8 ++++---- libcxx/docs/index.rst | 2 +- libcxx/src/experimental/time_zone.cpp | 9 +++++++++ .../alg.contains/ranges.contains.pass.cpp | 4 ++-- .../equality_comparable.compile.pass.cpp | 6 ++++++ .../equality_comparable_with.compile.pass.cpp | 15 +++++++++++++++ .../totally_ordered.compile.pass.cpp | 3 +++ .../totally_ordered_with.compile.pass.cpp | 10 ++++++++++ .../new.delete.array/new.size.except.pass.cpp | 3 +++ .../new.delete/new.delete.array/new.size.pass.cpp | 3 +++ .../new.size_align.except.pass.cpp | 3 +++ .../new.delete.array/new.size_align.pass.cpp | 3 +++ .../new.delete.single/new.size.except.pass.cpp | 3 +++ .../new.delete.single/new.size.pass.cpp | 3 +++ .../new.size_align.except.pass.cpp | 3 +++ .../new.delete.single/new.size_align.pass.cpp | 3 +++ .../rand.dist.samp.discrete/ctor_func.pass.cpp | 3 +++ .../param_ctor_func.pass.cpp | 3 +++ .../range.lazy.split/general.pass.cpp | 12 ++++++++++++ .../expected.expected/monadic/transform.pass.cpp | 4 ++-- .../monadic/transform_error.pass.cpp | 4 ++-- .../monadic/transform_error.pass.cpp | 4 ++-- .../formatter.char_array.pass.cpp | 2 +- .../meta/meta.rel/is_virtual_base_of.pass.cpp | 7 +++++++ ...le.pass.cpp => dependent_return_type.pass.cpp} | 4 ++++ .../meta.unary.prop/is_implicit_lifetime.pass.cpp | 2 +- .../make_optional_explicit.pass.cpp | 3 +++ ...ke_optional_explicit_initializer_list.pass.cpp | 3 +++ .../tuple.tuple/tuple.cnstr/PR31384.pass.cpp | 2 +- .../catch_member_function_pointer_02.pass.cpp | 2 +- 30 files changed, 119 insertions(+), 17 deletions(-) rename libcxx/test/std/utilities/meta/meta.unary/{dependent_return_type.compile.pass.cpp => dependent_return_type.pass.cpp} (94%) diff --git 
a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 80f2432b78dea..f0bdf6c0b5899 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -52,8 +52,8 @@ jobs: cxx: [ 'clang++-21' ] include: - config: 'generic-gcc' - cc: 'gcc-14' - cxx: 'g++-14' + cc: 'gcc-15' + cxx: 'g++-15' steps: - uses: actions/checkout@11bd71901bbe5b1630ceea73d27597364c9af683 # v4.2.2 - name: ${{ matrix.config }}.${{ matrix.cxx }} @@ -92,8 +92,8 @@ jobs: cxx: [ 'clang++-21' ] include: - config: 'generic-gcc-cxx11' - cc: 'gcc-14' - cxx: 'g++-14' + cc: 'gcc-15' + cxx: 'g++-15' - config: 'generic-cxx26' cc: 'clang-20' cxx: 'clang++-20' diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 9c957e9d20cb7..ae9cc87c797f8 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -135,7 +135,7 @@ Compiler Versions Restrictions Support policy Clang 19, 20, 21-git latest two stable releases per `LLVM's release page `_ and the development version AppleClang 15 latest stable release per `Xcode's release page `_ Open XL 17.1.3 (AIX) latest stable release per `Open XL's documentation page `_ -GCC 14 In C++11 or later only latest stable release per `GCC's release page `_ +GCC 15 In C++11 or later only latest stable release per `GCC's release page `_ ============ =================== ========================== ===================== Libc++ also supports common platforms and architectures: diff --git a/libcxx/src/experimental/time_zone.cpp b/libcxx/src/experimental/time_zone.cpp index 289164ab12036..a735800b60317 100644 --- a/libcxx/src/experimental/time_zone.cpp +++ b/libcxx/src/experimental/time_zone.cpp @@ -29,6 +29,15 @@ // These quirks often use a 12h interval; this is the scan interval of zdump, // which implies there are no sys_info objects with a duration of less than 12h. 
+// Work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120502 + +#include <__config> + +// TODO(LLVM 23): When upgrading to GCC 16 this can be removed +#ifdef _LIBCPP_COMPILER_GCC +# pragma GCC optimize("-O0") +#endif + #include #include #include diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp index 08d8e119a4d24..1e89cd272e643 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.contains/ranges.contains.pass.cpp @@ -195,7 +195,7 @@ constexpr bool test() { std::string a[] = {str1, str1, str, str1, str1}; auto whole = std::ranges::subrange(forward_iterator(std::move_iterator(a)), forward_iterator(std::move_iterator(a + 5))); - bool ret = std::ranges::contains(whole.begin(), whole.end(), "hello world", [&](const std::string i) { + bool ret = std::ranges::contains(whole.begin(), whole.end(), +"hello world", [&](const std::string i) { ++projection_count; return i; }); @@ -207,7 +207,7 @@ constexpr bool test() { std::string a[] = {str1, str1, str, str1, str1}; auto whole = std::ranges::subrange(forward_iterator(std::move_iterator(a)), forward_iterator(std::move_iterator(a + 5))); - bool ret = std::ranges::contains(whole, "hello world", [&](const std::string i) { + bool ret = std::ranges::contains(whole, +"hello world", [&](const std::string i) { ++projection_count; return i; }); diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp index ca0f40eb77d49..0531c0e096a13 100644 --- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp +++ 
b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable.compile.pass.cpp @@ -26,6 +26,7 @@ #include #include "compare_types.h" +#include "test_macros.h" namespace fundamentals { static_assert(std::equality_comparable); @@ -43,7 +44,12 @@ static_assert(std::equality_comparable); static_assert(std::equality_comparable); static_assert(std::equality_comparable); static_assert(std::equality_comparable); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. +#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(std::equality_comparable); +#else +static_assert(!std::equality_comparable); +#endif static_assert(std::equality_comparable); static_assert(std::equality_comparable); static_assert(std::equality_comparable); diff --git a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp index 0afbe582ba896..2f8d7862c0f4d 100644 --- a/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concept.equalitycomparable/equality_comparable_with.compile.pass.cpp @@ -107,7 +107,12 @@ static_assert(!check_equality_comparable_with < int, int (S::*)() const volatile&& noexcept > ()); static_assert(check_equality_comparable_with()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. 
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_equality_comparable_with()); +#else +static_assert(!check_equality_comparable_with()); +#endif static_assert(!check_equality_comparable_with()); static_assert(!check_equality_comparable_with()); static_assert(!check_equality_comparable_with()); @@ -148,7 +153,12 @@ static_assert( static_assert(!check_equality_comparable_with < int*, int (S::*)() const volatile&& noexcept > ()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. +#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_equality_comparable_with()); +#else +static_assert(!check_equality_comparable_with()); +#endif static_assert(!check_equality_comparable_with()); static_assert(!check_equality_comparable_with()); static_assert(!check_equality_comparable_with()); @@ -942,7 +952,12 @@ static_assert( static_assert(!check_equality_comparable_with()); static_assert(check_equality_comparable_with()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. 
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_equality_comparable_with()); +#else +static_assert(!check_equality_comparable_with()); +#endif static_assert(check_equality_comparable_with()); static_assert(check_equality_comparable_with()); static_assert(check_equality_comparable_with()); diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp index 6f8324eaf7647..5959f70cf3963 100644 --- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered.compile.pass.cpp @@ -55,7 +55,10 @@ static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); +// Array comparisons are ill-formed in C++26 +#if TEST_STD_VER <= 23 static_assert(models_totally_ordered()); +#endif static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); static_assert(models_totally_ordered()); diff --git a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp index dffc33265aebf..398ef445baf9d 100644 --- a/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp +++ b/libcxx/test/std/concepts/concepts.compare/concepts.totallyordered/totally_ordered_with.compile.pass.cpp @@ -89,7 +89,12 @@ static_assert(!check_totally_ordered_with()) static_assert(!check_totally_ordered_with < int, int (S::*)() const volatile&& noexcept > ()); static_assert(check_totally_ordered_with()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. 
+#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_totally_ordered_with()); +#else +static_assert(!check_totally_ordered_with()); +#endif static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with()); @@ -117,7 +122,12 @@ static_assert(!check_totally_ordered_with < int*, int (S::*)() volatile&& noexce static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with < int*, int (S::*)() const volatile&& noexcept > ()); +// Array comparisons are ill-formed in C++26, but Clang doesn't implement this yet. +#if TEST_STD_VER <= 23 || defined(TEST_COMPILER_CLANG) static_assert(check_totally_ordered_with()); +#else +static_assert(!check_totally_ordered_with()); +#endif static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with()); static_assert(!check_totally_ordered_with()); diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp index 6a2b098c1b573..9ee32b8417832 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp index 437d064307735..4fdcc3b535a8d 100644 --- 
a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.pass.cpp @@ -11,6 +11,9 @@ // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp index 4e34ebcb46c7d..4dfaf7a30d7a2 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. // XFAIL: target={{.+}}-zos{{.*}} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp index c9b59ecaff396..a1b8466340a2a 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.pass.cpp @@ -13,6 +13,9 @@ // asan and msan will not call the new handler. 
// UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. // XFAIL: target={{.+}}-zos{{.*}} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp index 6a515555e6dbd..346e881d016be 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp index 729ef3ec46b0c..0013dd3d0cbc3 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.pass.cpp @@ -11,6 +11,9 @@ // asan and msan will not call the new handler. 
// UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include #include diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp index 7694314c87bf3..fbeb880c83d8d 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp @@ -9,6 +9,9 @@ // UNSUPPORTED: no-exceptions // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. // XFAIL: target={{.+}}-zos{{.*}} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp index 5d321f08282b2..59ecbe205513a 100644 --- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.pass.cpp @@ -13,6 +13,9 @@ // asan and msan will not call the new handler. // UNSUPPORTED: sanitizer-new-delete +// GCC warns about allocating numeric_limits::max() being too large (which we test here) +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + // Libc++ when built for z/OS doesn't contain the aligned allocation functions, // nor does the dynamic library shipped with z/OS. 
// XFAIL: target={{.+}}-zos{{.*}} diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp index c3a88af92d360..c05a9434175a8 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/ctor_func.pass.cpp @@ -15,6 +15,9 @@ // discrete_distribution(size_t nw, double xmin, double xmax, // UnaryOperation fw); +// There is a bogus diagnostic about a too large allocation +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include diff --git a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp index 7ef936b7fc355..206bf5a0eb8a2 100644 --- a/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp +++ b/libcxx/test/std/numerics/rand/rand.dist/rand.dist.samp/rand.dist.samp.discrete/param_ctor_func.pass.cpp @@ -15,6 +15,9 @@ // param_type(size_t nw, double xmin, double xmax, // UnaryOperation fw); +// There is a bogus diagnostic about a too large allocation +// ADDITIONAL_COMPILE_FLAGS(gcc): -Wno-alloc-size-larger-than + #include #include diff --git a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp index f4e87bb47399e..521c0b1610bce 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.lazy.split/general.pass.cpp @@ -312,7 +312,10 @@ constexpr bool main_test() { // Leading separator. { std::array expected = {""sv, "abc"sv, "def"sv}; +// FIXME: Why does GCC complain here? 
+#ifndef TEST_COMPILER_GCC test_one(" abc def"sv, short_sep, expected); +#endif test_one("12abc12def"sv, long_sep, expected); } @@ -326,7 +329,10 @@ constexpr bool main_test() { // Input consisting of a single separator. { std::array expected = {""sv, ""sv}; +// FIXME: Why does GCC complain here? +#ifndef TEST_COMPILER_GCC test_one(" "sv, short_sep, expected); +#endif test_one("12"sv, long_sep, expected); } @@ -354,7 +360,10 @@ constexpr bool main_test() { // Separators after every character. { std::array expected = {""sv, "a"sv, "b"sv, "c"sv, ""sv}; +// FIXME: Why does GCC complain here? +#ifndef TEST_COMPILER_GCC test_one(" a b c "sv, short_sep, expected); +#endif test_one("12a12b12c12"sv, long_sep, expected); } @@ -383,7 +392,10 @@ constexpr bool main_test() { // Terminating null as a separator. { std::array expected = {"abc"sv, "def"sv}; +// FIXME: Why does GCC complain here? +#ifndef TEST_COMPILER_GCC test_one("abc\0def"sv, '\0', expected); +#endif test_one("abc\0\0def"sv, "\0\0"sv, expected); } diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp index cbd54d623c0f4..97c1e4a40f355 100644 --- a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp @@ -9,8 +9,8 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`, -// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333 -// XFAIL: gcc-14 +// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995 +// XFAIL: gcc-14, gcc-15 // diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp index a19e17b01f6a9..9570b2faac692 100644 --- 
a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp @@ -9,8 +9,8 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`, -// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333. -// XFAIL: gcc-14 +// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995. +// XFAIL: gcc-14, gcc-15 // diff --git a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp index f6d3011d1ea96..2ec15b51d11ea 100644 --- a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp @@ -9,8 +9,8 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`, -// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333 -// XFAIL: gcc-14 +// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=98995 +// XFAIL: gcc-14, gcc-15 // diff --git a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp index bc056db9e254e..8c4f3000ec1e8 100644 --- a/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp +++ b/libcxx/test/std/utilities/format/format.formatter/format.formatter.spec/formatter.char_array.pass.cpp @@ -8,7 +8,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // TODO FMT __builtin_memcpy isn't constexpr in GCC -// UNSUPPORTED: gcc-14 +// UNSUPPORTED: gcc-14, gcc-15 // diff --git 
a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp index f443d2030961d..47c95c64a0855 100644 --- a/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.rel/is_virtual_base_of.pass.cpp @@ -18,6 +18,8 @@ #include #include +#include "test_macros.h" + template void test() { // Test the type of the variables @@ -98,8 +100,13 @@ int main(int, char**) { // Test with virtual inheritance { +#ifdef TEST_COMPILER_GCC // FIXME: Is this a GCC or Clang bug? Or is the standards wording ambiguous? + test(); + test(); +#else test(); test(); +#endif test(); test(); test(); diff --git a/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp similarity index 94% rename from libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp rename to libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp index 935a6e3db0017..37d66831c7ce5 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.compile.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/dependent_return_type.pass.cpp @@ -168,3 +168,7 @@ void instantiate() { void_t(); #endif } + +// This is not a .compile.pass.cpp because we want to ensure that GCC doesn't complain about incorrect builtins usage, +// which only happens during CodeGen. 
+int main(int, char**) { return 0; } diff --git a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp index 681ad13a07dfd..afd76e65060e3 100644 --- a/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp +++ b/libcxx/test/std/utilities/meta/meta.unary/meta.unary.prop/is_implicit_lifetime.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // These compilers don't support __builtin_is_implicit_lifetime yet. -// UNSUPPORTED: clang-18, clang-19, gcc-14, apple-clang-15, apple-clang-16, apple-clang-17 +// UNSUPPORTED: clang-18, clang-19, gcc-14, gcc-15, apple-clang-15, apple-clang-16, apple-clang-17 // diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp index e7931e07e31d1..23f131d2fc499 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit.pass.cpp @@ -12,6 +12,9 @@ // template // constexpr optional make_optional(Args&&... args); +// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577 +// XFAIL: gcc-15 + #include #include #include diff --git a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp index 80371d6333712..5ddb229ad9268 100644 --- a/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp +++ b/libcxx/test/std/utilities/optional/optional.specalg/make_optional_explicit_initializer_list.pass.cpp @@ -12,6 +12,9 @@ // template // constexpr optional make_optional(initializer_list il, Args&&... 
args); +// GCC crashes on this file, see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=120577 +// XFAIL: gcc-15 + #include #include #include diff --git a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp index e6812e9a3a30a..ae5984c155300 100644 --- a/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp +++ b/libcxx/test/std/utilities/tuple/tuple.tuple/tuple.cnstr/PR31384.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03 // FIXME: Why does this start to fail with GCC 14? -// XFAIL: !(c++11 || c++14) && gcc-14 +// XFAIL: !(c++11 || c++14) && (gcc-14 || gcc-15) // See https://llvm.org/PR31384. diff --git a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp index 5d702031ce352..ec400713620c1 100644 --- a/libcxxabi/test/catch_member_function_pointer_02.pass.cpp +++ b/libcxxabi/test/catch_member_function_pointer_02.pass.cpp @@ -12,7 +12,7 @@ // GCC supports noexcept function types but this test still fails. // This is likely a bug in their implementation. Investigation needed. -// XFAIL: gcc-14 +// XFAIL: gcc-14, gcc-15 #include From 806333063ff9a09ca001dcd77d4d5d6f0b9ecd74 Mon Sep 17 00:00:00 2001 From: Jesse Huang Date: Thu, 12 Jun 2025 02:24:10 +0800 Subject: [PATCH 120/851] [RISCV] Guard the alternative static chain register use on ILP32E/LP64E (#142715) Asserts the use of t3(x28) as the static chain register when branch control flow protection is enabled with ILP32E/LP64E, because such register is not present within the ABI. 
--- llvm/lib/Target/RISCV/RISCVCallingConv.cpp | 24 ++++++++++++++-------- llvm/test/CodeGen/RISCV/nest-register.ll | 3 +++ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp index e0d1fb2facc87..cb6117eb0917b 100644 --- a/llvm/lib/Target/RISCV/RISCVCallingConv.cpp +++ b/llvm/lib/Target/RISCV/RISCVCallingConv.cpp @@ -333,15 +333,23 @@ bool llvm::CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, unsigned XLen = Subtarget.getXLen(); MVT XLenVT = Subtarget.getXLenVT(); - // Static chain parameter must not be passed in normal argument registers, - // so we assign t2/t3 for it as done in GCC's __builtin_call_with_static_chain - bool HasCFBranch = - Subtarget.hasStdExtZicfilp() && - MF.getFunction().getParent()->getModuleFlag("cf-protection-branch"); - // Normal: t2, Branch control flow protection: t3 - const auto StaticChainReg = HasCFBranch ? RISCV::X28 : RISCV::X7; - if (ArgFlags.isNest()) { + // Static chain parameter must not be passed in normal argument registers, + // so we assign t2/t3 for it as done in GCC's + // __builtin_call_with_static_chain + bool HasCFBranch = + Subtarget.hasStdExtZicfilp() && + MF.getFunction().getParent()->getModuleFlag("cf-protection-branch"); + + // Normal: t2, Branch control flow protection: t3 + const auto StaticChainReg = HasCFBranch ? 
RISCV::X28 : RISCV::X7; + + RISCVABI::ABI ABI = Subtarget.getTargetABI(); + if (HasCFBranch && + (ABI == RISCVABI::ABI_ILP32E || ABI == RISCVABI::ABI_LP64E)) + reportFatalUsageError( + "Nested functions with control flow protection are not " + "usable with ILP32E or LP64E ABI."); if (MCRegister Reg = State.AllocateReg(StaticChainReg)) { State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo)); return false; diff --git a/llvm/test/CodeGen/RISCV/nest-register.ll b/llvm/test/CodeGen/RISCV/nest-register.ll index 9f8e4e1a2d8d3..6e892e05c4297 100644 --- a/llvm/test/CodeGen/RISCV/nest-register.ll +++ b/llvm/test/CodeGen/RISCV/nest-register.ll @@ -5,6 +5,8 @@ ; RUN: | FileCheck -check-prefix=RV64I %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zicfilp -verify-machineinstrs < %s \ ; RUN: | FileCheck -check-prefix=RV64I-ZICFILP %s +; RUN: not llc -mtriple=riscv64 -target-abi=lp64e -mattr=+experimental-zicfilp \ +; RUN: -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=LP64E-ZICFILP %s ; Tests that the 'nest' parameter attribute causes the relevant parameter to be ; passed in the right register. @@ -63,6 +65,7 @@ define ptr @nest_caller(ptr %arg) nounwind { ret ptr %result } +; LP64E-ZICFILP: LLVM ERROR: Nested functions with control flow protection are not usable with ILP32E or LP64E ABI. !llvm.module.flags = !{!0} !0 = !{i32 8, !"cf-protection-branch", i32 1} From 7a0c9f607a26b77a7e584fd6734f03b7ee40ca95 Mon Sep 17 00:00:00 2001 From: Tony Varghese Date: Wed, 11 Jun 2025 23:56:15 +0530 Subject: [PATCH 121/851] [NFC][PowerPC] Pre-commit test case for exploitation of xxeval for the pattern ternary(A,X,or(B,C)) (#143693) Pre-commit test case for exploitation of `xxeval` for ternary operations of the pattern `ternary(A,X,or(B,C))`. Exploitation of `xxeval` to be added later. 
Co-authored-by: Tony Varghese --- .../CodeGen/PowerPC/xxeval-vselect-x-or.ll | 268 ++++++++++++++++++ 1 file changed, 268 insertions(+) create mode 100644 llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll diff --git a/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll new file mode 100644 index 0000000000000..1ad7e95e3682e --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/xxeval-vselect-x-or.ll @@ -0,0 +1,268 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; Test file to verify the emission of Vector selection instructions when ternary operators are used. + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64le-unknown-unknown \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; RUN: llc -verify-machineinstrs -mcpu=pwr10 -mtriple=powerpc64-ibm-aix-xcoff \ +; RUN: -ppc-asm-full-reg-names --ppc-vsr-nums-as-vr < %s | FileCheck %s + +; Function to test ternary(A, and(B, C), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_and_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_and_BC_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %and, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, and(B, C), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_and_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_and_BC_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; 
CHECK-NEXT: xxland vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %and, <2 x i64> %or + ret <2 x i64> %res +} + +; Function to test ternary(A, B, or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_B_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_B_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %B, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, B, or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_B_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_B_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v3, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %B, <2 x i64> %or + ret <2 x i64> %res +} + + +; Function to test ternary(A, C, or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_C_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_C_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %C, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, C, or(B, C)) 
for <2 x i64> +define <2 x i64> @ternary_A_C_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_C_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlor vs0, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs0, v4, v2 +; CHECK-NEXT: blr +entry: + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %C, <2 x i64> %or + ret <2 x i64> %res +} + + +; Function to test ternary(A, eqv(B,C), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_eqv_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_eqv_BC_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxleqv vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <4 x i32> %B, %C + %eqv = xor <4 x i32> %xor, ; Vector eqv operation + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %eqv, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, eqv(B,C), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_eqv_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_eqv_BC_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxleqv vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %xor = xor <2 x i64> %B, %C + %eqv = xor <2 x i64> %xor, ; Vector eqv operation + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %eqv, <2 x i64> %or + ret <2 x i64> %res +} + +; Function to test ternary(A, not(C), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_not_C_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) 
{ +; CHECK-LABEL: ternary_A_not_C_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <4 x i32> %C, ; Vector not operation + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, not(C), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_not_C_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_not_C_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v4, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <2 x i64> %C, ; Vector not operation + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %or + ret <2 x i64> %res +} + +; Function to test ternary(A, not(B), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_not_B_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_not_B_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <4 x i32> %B, ; Vector not operation + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %not, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, not(B), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_not_B_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_not_B_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; 
CHECK-NEXT: xxlnor vs0, v3, v3 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %not = xor <2 x i64> %B, ; Vector not operation + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %not, <2 x i64> %or + ret <2 x i64> %res +} + +; Function to test ternary(A, nand(B,C), or(B, C)) for <4 x i32> +define <4 x i32> @ternary_A_nand_BC_or_BC_4x32(<4 x i1> %A, <4 x i32> %B, <4 x i32> %C) { +; CHECK-LABEL: ternary_A_nand_BC_or_BC_4x32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxleqv v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: vslw v2, v2, v5 +; CHECK-NEXT: vsraw v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <4 x i32> %B, %C + %nand = xor <4 x i32> %and, ; Vector nand operation + %or = or <4 x i32> %B, %C + %res = select <4 x i1> %A, <4 x i32> %nand, <4 x i32> %or + ret <4 x i32> %res +} + +; Function to test ternary(A, nand(B,C), or(B, C)) for <2 x i64> +define <2 x i64> @ternary_A_nand_BC_or_BC_2x64(<2 x i1> %A, <2 x i64> %B, <2 x i64> %C) { +; CHECK-LABEL: ternary_A_nand_BC_or_BC_2x64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xxlxor v5, v5, v5 +; CHECK-NEXT: xxlnand vs0, v3, v4 +; CHECK-NEXT: xxlor vs1, v3, v4 +; CHECK-NEXT: xxsplti32dx v5, 1, 63 +; CHECK-NEXT: vsld v2, v2, v5 +; CHECK-NEXT: vsrad v2, v2, v5 +; CHECK-NEXT: xxsel v2, vs1, vs0, v2 +; CHECK-NEXT: blr +entry: + %and = and <2 x i64> %B, %C + %nand = xor <2 x i64> %and, ; Vector nand operation + %or = or <2 x i64> %B, %C + %res = select <2 x i1> %A, <2 x i64> %nand, <2 x i64> %or + ret <2 x i64> %res +} From 8d7da9a2a40302af25ee70841a4b549f4ed5ee8a Mon Sep 17 00:00:00 2001 From: Yifei Xu Date: Wed, 11 Jun 2025 13:33:23 -0500 Subject: [PATCH 122/851] Update BUILD.bazel Add missing dependency after https://github.com/llvm/llvm-project/pull/142916. 
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index c1d63de04b8f0..f6a7cd7dea85b 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6882,6 +6882,7 @@ cc_library( ":SPIRVDialect", ":Support", "//llvm:config", + "//llvm:Support", ], ) From 773d357b9882fe0e30ffddee5ac1fbe2254fac05 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 20:39:45 +0200 Subject: [PATCH 123/851] [libc++] Simplify the implementation of __next_prime a bit (#143512) --- libcxx/src/hash.cpp | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/libcxx/src/hash.cpp b/libcxx/src/hash.cpp index 41c4eb480a5fc..50d8cf9f9f539 100644 --- a/libcxx/src/hash.cpp +++ b/libcxx/src/hash.cpp @@ -9,7 +9,6 @@ #include <__hash_table> #include #include -#include _LIBCPP_CLANG_DIAGNOSTIC_IGNORED("-Wtautological-constant-out-of-range-compare") @@ -52,16 +51,15 @@ const unsigned indices[] = { // are fewer potential primes to search, and fewer potential primes to divide // against. 
-template -inline _LIBCPP_HIDE_FROM_ABI typename enable_if<_Sz == 4, void>::type __check_for_overflow(size_t N) { - if (N > 0xFFFFFFFB) - std::__throw_overflow_error("__next_prime overflow"); -} - -template -inline _LIBCPP_HIDE_FROM_ABI typename enable_if<_Sz == 8, void>::type __check_for_overflow(size_t N) { - if (N > 0xFFFFFFFFFFFFFFC5ull) - std::__throw_overflow_error("__next_prime overflow"); +inline void __check_for_overflow(size_t N) { + if constexpr (sizeof(size_t) == 4) { + if (N > 0xFFFFFFFB) + std::__throw_overflow_error("__next_prime overflow"); + } else { + static_assert(sizeof(size_t) == 8); + if (N > 0xFFFFFFFFFFFFFFC5ull) + std::__throw_overflow_error("__next_prime overflow"); + } } size_t __next_prime(size_t n) { From 8dc63ca59003a4b72217221c1c801237614c9d7d Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 11 Jun 2025 11:47:09 -0700 Subject: [PATCH 124/851] Make clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c write output file to temp dir --- clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c index 5d65fdafaa251..d761e12e8392e 100644 --- a/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c +++ b/clang/test/Frontend/aarch64-print-enabled-extensions-cc1.c @@ -57,7 +57,7 @@ // RUN: | FileCheck --strict-whitespace --implicit-check-not=FEAT_ %s --check-prefix=DISABLE_VIA_XCLANG // However, sve2 is actually enabled in clang but disabled for MC. 
-// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s \ +// RUN: %clang --target=aarch64 -march=armv8-a+sve2 -c %s -o %t \ // RUN: -Xclang -target-feature -Xclang -sve \ // RUN: -Xclang -verify -Xclang -verify-ignore-unexpected=note From 0c62571d9f02f7d5c1a649b5b20fdf5b0f6bb41c Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 11 Jun 2025 20:57:07 +0200 Subject: [PATCH 125/851] [libc++] Remove static_assert from hash.cpp that fires unconditionall --- libcxx/src/hash.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/libcxx/src/hash.cpp b/libcxx/src/hash.cpp index 50d8cf9f9f539..e1e6d2b4c2bdb 100644 --- a/libcxx/src/hash.cpp +++ b/libcxx/src/hash.cpp @@ -56,7 +56,6 @@ inline void __check_for_overflow(size_t N) { if (N > 0xFFFFFFFB) std::__throw_overflow_error("__next_prime overflow"); } else { - static_assert(sizeof(size_t) == 8); if (N > 0xFFFFFFFFFFFFFFC5ull) std::__throw_overflow_error("__next_prime overflow"); } From 02b6849cf1feb425885bf6f5ee505d5cd4a824d7 Mon Sep 17 00:00:00 2001 From: Abhinav Gaba Date: Wed, 11 Jun 2025 12:03:55 -0700 Subject: [PATCH 126/851] [Clang][OpenMP] Fix mapping of arrays of structs with members with mappers (#142511) This builds upon #101101 from @jyu2-git, which used compiler-generated mappers when mapping an array-section of structs with members that have user-defined default mappers. Now we do the same when mapping arrays of structs. 
--- clang/docs/ReleaseNotes.rst | 3 + clang/lib/Sema/SemaOpenMP.cpp | 38 ++- ...of_structs_with_nested_mapper_ast_dump.cpp | 34 ++ ..._of_structs_with_nested_mapper_codegen.cpp | 323 ++++++++++++++++++ ...f_structs_with_nested_mapper_ast_dump.cpp} | 0 ...of_structs_with_nested_mapper_codegen.cpp} | 0 ...re_mapper_nested_default_mappers_array.cpp | 6 +- 7 files changed, 388 insertions(+), 16 deletions(-) create mode 100644 clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp create mode 100644 clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp rename clang/test/OpenMP/{target_map_nest_defalut_mapper_ast_dump.cpp => target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp} (100%) rename clang/test/OpenMP/{target_map_nest_defalut_mapper_codegen.cpp => target_map_array_section_of_structs_with_nested_mapper_codegen.cpp} (100%) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b5e6cf088a4b1..8043ab48f0b4f 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1102,6 +1102,9 @@ OpenMP Support - An error is now emitted when OpenMP ``collapse`` and ``ordered`` clauses have an argument larger than what can fit within a 64-bit integer. - Added support for private variable reduction. +- Fixed mapping of arrays of structs containing nested structs with user defined + mappers, by using compiler-generated default mappers for the outer structs for + such maps. Improvements ^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index a3395ac157d96..2cbe79c5c07ca 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -22057,20 +22057,34 @@ static void checkMappableExpressionList( Type.getCanonicalType(), UnresolvedMapper); if (ER.isInvalid()) continue; - if (!ER.get() && isa(VE)) { - // Create implicit mapper as needed. 
- QualType BaseType = VE->getType().getCanonicalType(); - if (BaseType->isSpecificBuiltinType(BuiltinType::ArraySection)) { - const auto *OASE = cast(VE->IgnoreParenImpCasts()); - QualType BType = ArraySectionExpr::getBaseOriginalType(OASE->getBase()); - QualType ElemType; - if (const auto *ATy = BType->getAsArrayTypeUnsafe()) - ElemType = ATy->getElementType(); - else - ElemType = BType->getPointeeType(); + + // If no user-defined mapper is found, we need to create an implicit one for + // arrays/array-sections on structs that have members that have + // user-defined mappers. This is needed to ensure that the mapper for the + // member is invoked when mapping each element of the array/array-section. + if (!ER.get()) { + QualType BaseType; + + if (isa(VE)) { + BaseType = VE->getType().getCanonicalType(); + if (BaseType->isSpecificBuiltinType(BuiltinType::ArraySection)) { + const auto *OASE = cast(VE->IgnoreParenImpCasts()); + QualType BType = + ArraySectionExpr::getBaseOriginalType(OASE->getBase()); + QualType ElemType; + if (const auto *ATy = BType->getAsArrayTypeUnsafe()) + ElemType = ATy->getElementType(); + else + ElemType = BType->getPointeeType(); + BaseType = ElemType.getCanonicalType(); + } + } else if (VE->getType()->isArrayType()) { + const ArrayType *AT = VE->getType()->getAsArrayTypeUnsafe(); + const QualType ElemType = AT->getElementType(); BaseType = ElemType.getCanonicalType(); } - if (BaseType->getAsRecordDecl() && + + if (!BaseType.isNull() && BaseType->getAsRecordDecl() && isImplicitMapperNeeded(SemaRef, DSAS, BaseType, VE)) { ER = buildImplicitMapper(SemaRef, BaseType, DSAS); } diff --git a/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp new file mode 100644 index 0000000000000..a5847709d3e76 --- /dev/null +++ b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_ast_dump.cpp @@ -0,0 +1,34 @@ +//RUN: %clang_cc1 -triple 
x86_64-pc-linux-gnu -fopenmp -ast-dump %s | FileCheck %s --check-prefix=DUM + +typedef struct { + int a; +} C; +#pragma omp declare mapper(C s) map(to : s.a) + +typedef struct { + int e; + C f; + int h; +} D; + +void foo() { + D sa[10]; + sa[1].e = 111; + sa[1].f.a = 222; + +#pragma omp target map(tofrom : sa) + { + sa[0].e = 333; + sa[1].f.a = 444; + } +} + +// DUM: -OMPDeclareMapperDecl{{.*}}<> +// DUM-NEXT: |-OMPMapClause {{.*}}<> +// DUM-NEXT: | |-MemberExpr {{.*}} 'int' lvalue .e +// DUM-NEXT: | | `-DeclRefExpr {{.*}}<> 'D' lvalue Var {{.*}} '_s' 'D' +// DUM-NEXT: | |-MemberExpr {{.*}} 'C' lvalue .f {{.*}} +// DUM-NEXT: | | `-DeclRefExpr {{.*}}<> 'D' lvalue Var {{.*}} '_s' 'D' +// DUM-NEXT: | `-MemberExpr {{.*}} 'int' lvalue .h {{.*}} +// DUM-NEXT: | `-DeclRefExpr {{.*}}<> 'D' lvalue Var {{.*}} '_s' 'D' +// DUM-NEXT: `-VarDecl {{.*}} col:1 implicit used _s 'D' diff --git a/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp new file mode 100644 index 0000000000000..5df1e958ad55a --- /dev/null +++ b/clang/test/OpenMP/target_map_array_of_structs_with_nested_mapper_codegen.cpp @@ -0,0 +1,323 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-globals --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "reduction_size[.].+[.]" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --global-value-regex "\.offload_.*" +// RUN: %clang_cc1 -verify -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -std=c++11 -triple powerpc64le-unknown-unknown -emit-pch -o %t %s +// RUN: %clang_cc1 -fopenmp -fopenmp-targets=powerpc64le-ibm-linux-gnu -x c++ -triple powerpc64le-unknown-unknown -std=c++11 -include-pch %t -verify %s 
-emit-llvm -o - | FileCheck %s + +// expected-no-diagnostics +#ifndef HEADER +#define HEADER + +typedef struct { + int a; +} C; +#pragma omp declare mapper(C s) map(to : s.a) + +typedef struct { + int e; + C f; + int h; +} D; + +void foo() { + D sa[10]; + sa[1].e = 111; + sa[1].f.a = 222; + +#pragma omp target map(tofrom : sa) + { + sa[1].e = 333; + sa[1].f.a = 444; + } +} +#endif +//. +// CHECK: @.offload_sizes = private unnamed_addr constant [1 x i64] [i64 120] +// CHECK: @.offload_maptypes = private unnamed_addr constant [1 x i64] [i64 35] +//. +// CHECK-LABEL: define {{[^@]+}}@_Z3foov +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SA:%.*]] = alloca [10 x %struct.D], align 4 +// CHECK-NEXT: [[DOTOFFLOAD_BASEPTRS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_PTRS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[DOTOFFLOAD_MAPPERS:%.*]] = alloca [1 x ptr], align 8 +// CHECK-NEXT: [[KERNEL_ARGS:%.*]] = alloca [[STRUCT___TGT_KERNEL_ARGUMENTS:%.*]], align 8 +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 1 +// CHECK-NEXT: [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D:%.*]], ptr [[ARRAYIDX]], i32 0, i32 0 +// CHECK-NEXT: store i32 111, ptr [[E]], align 4 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[SA]], i64 0, i64 1 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1 +// CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0 +// CHECK-NEXT: store i32 222, ptr [[A]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[SA]], ptr [[TMP0]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-NEXT: store ptr [[SA]], ptr [[TMP1]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr 
inbounds [1 x ptr], ptr [[DOTOFFLOAD_MAPPERS]], i64 0, i64 0 +// CHECK-NEXT: store ptr @.omp_mapper._ZTS1D.default, ptr [[TMP2]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_BASEPTRS]], i32 0, i32 0 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1 x ptr], ptr [[DOTOFFLOAD_PTRS]], i32 0, i32 0 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 0 +// CHECK-NEXT: store i32 3, ptr [[TMP5]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 1 +// CHECK-NEXT: store i32 1, ptr [[TMP6]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 2 +// CHECK-NEXT: store ptr [[TMP3]], ptr [[TMP7]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 3 +// CHECK-NEXT: store ptr [[TMP4]], ptr [[TMP8]], align 8 +// CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 4 +// CHECK-NEXT: store ptr @.offload_sizes, ptr [[TMP9]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 5 +// CHECK-NEXT: store ptr @.offload_maptypes, ptr [[TMP10]], align 8 +// CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 6 +// CHECK-NEXT: store ptr null, ptr [[TMP11]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 7 +// CHECK-NEXT: store ptr [[DOTOFFLOAD_MAPPERS]], ptr [[TMP12]], align 8 +// CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 8 +// CHECK-NEXT: store 
i64 0, ptr [[TMP13]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 9 +// CHECK-NEXT: store i64 0, ptr [[TMP14]], align 8 +// CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 10 +// CHECK-NEXT: store [3 x i32] [i32 -1, i32 0, i32 0], ptr [[TMP15]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 11 +// CHECK-NEXT: store [3 x i32] zeroinitializer, ptr [[TMP16]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds nuw [[STRUCT___TGT_KERNEL_ARGUMENTS]], ptr [[KERNEL_ARGS]], i32 0, i32 12 +// CHECK-NEXT: store i32 0, ptr [[TMP17]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call i32 @__tgt_target_kernel(ptr @[[GLOB1:[0-9]+]], i64 -1, i32 -1, i32 0, ptr @.{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26.region_id, ptr [[KERNEL_ARGS]]) +// CHECK-NEXT: [[TMP19:%.*]] = icmp ne i32 [[TMP18]], 0 +// CHECK-NEXT: br i1 [[TMP19]], label [[OMP_OFFLOAD_FAILED:%.*]], label [[OMP_OFFLOAD_CONT:%.*]] +// CHECK: omp_offload.failed: +// CHECK-NEXT: call void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26(ptr [[SA]]) #[[ATTR3:[0-9]+]] +// CHECK-NEXT: br label [[OMP_OFFLOAD_CONT]] +// CHECK: omp_offload.cont: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}__Z3foov_l26 +// CHECK-SAME: (ptr noundef nonnull align 4 dereferenceable(120) [[SA:%.*]]) #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[SA_ADDR:%.*]] = alloca ptr, align 8 +// CHECK-NEXT: store ptr [[SA]], ptr [[SA_ADDR]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[SA_ADDR]], align 8, !nonnull [[META5:![0-9]+]], !align [[META6:![0-9]+]] +// CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[TMP0]], i64 0, i64 1 +// CHECK-NEXT: [[E:%.*]] = getelementptr inbounds 
nuw [[STRUCT_D:%.*]], ptr [[ARRAYIDX]], i32 0, i32 0 +// CHECK-NEXT: store i32 333, ptr [[E]], align 4 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [10 x %struct.D], ptr [[TMP0]], i64 0, i64 1 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[ARRAYIDX1]], i32 0, i32 1 +// CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C:%.*]], ptr [[F]], i32 0, i32 0 +// CHECK-NEXT: store i32 444, ptr [[A]], align 4 +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1D.default +// CHECK-SAME: (ptr noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP6:%.*]] = udiv exact i64 [[TMP3]], 12 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr [[STRUCT_D:%.*]], ptr [[TMP2]], i64 [[TMP6]] +// CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[TMP9:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP4]], 16 +// CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0 +// CHECK-NEXT: [[TMP12:%.*]] = and i1 [[TMP9]], [[TMP11]] +// CHECK-NEXT: [[TMP13:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP12]] +// CHECK-NEXT: [[DOTOMP_ARRAY__INIT__DELETE:%.*]] = icmp eq i64 [[TMP8]], 0 +// CHECK-NEXT: [[TMP14:%.*]] = and i1 [[TMP13]], [[DOTOMP_ARRAY__INIT__DELETE]] +// CHECK-NEXT: br i1 [[TMP14]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]] +// CHECK: .omp.array..init: +// CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP6]], 12 +// CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP4]], -4 +// CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], 512 +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP15]], i64 [[TMP17]], ptr [[TMP5]]) +// CHECK-NEXT: br label [[OMP_ARRAYMAP_HEAD]] +// CHECK: 
omp.arraymap.head: +// CHECK-NEXT: [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP7]] +// CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]] +// CHECK: omp.arraymap.body: +// CHECK-NEXT: [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END20:%.*]] ] +// CHECK-NEXT: [[E:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0 +// CHECK-NEXT: [[F:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 1 +// CHECK-NEXT: [[H:%.*]] = getelementptr inbounds nuw [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 2 +// CHECK-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[H]], i32 1 +// CHECK-NEXT: [[TMP19:%.*]] = ptrtoint ptr [[TMP18]] to i64 +// CHECK-NEXT: [[TMP20:%.*]] = ptrtoint ptr [[E]] to i64 +// CHECK-NEXT: [[TMP21:%.*]] = sub i64 [[TMP19]], [[TMP20]] +// CHECK-NEXT: [[TMP22:%.*]] = sdiv exact i64 [[TMP21]], ptrtoint (ptr getelementptr (i8, ptr null, i32 1) to i64) +// CHECK-NEXT: [[TMP23:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]]) +// CHECK-NEXT: [[TMP24:%.*]] = shl i64 [[TMP23]], 48 +// CHECK-NEXT: [[TMP25:%.*]] = add nuw i64 0, [[TMP24]] +// CHECK-NEXT: [[TMP26:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[TMP26]], 0 +// CHECK-NEXT: br i1 [[TMP27]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]] +// CHECK: omp.type.alloc: +// CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP25]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END:%.*]] +// CHECK: omp.type.alloc.else: +// CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[TMP26]], 1 +// CHECK-NEXT: br i1 [[TMP29]], label [[OMP_TYPE_TO:%.*]], label [[OMP_TYPE_TO_ELSE:%.*]] +// CHECK: omp.type.to: +// CHECK-NEXT: [[TMP30:%.*]] = and i64 [[TMP25]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.to.else: +// CHECK-NEXT: [[TMP31:%.*]] = icmp eq i64 
[[TMP26]], 2 +// CHECK-NEXT: br i1 [[TMP31]], label [[OMP_TYPE_FROM:%.*]], label [[OMP_TYPE_END]] +// CHECK: omp.type.from: +// CHECK-NEXT: [[TMP32:%.*]] = and i64 [[TMP25]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.end: +// CHECK-NEXT: [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP28]], [[OMP_TYPE_ALLOC]] ], [ [[TMP30]], [[OMP_TYPE_TO]] ], [ [[TMP32]], [[OMP_TYPE_FROM]] ], [ [[TMP25]], [[OMP_TYPE_TO_ELSE]] ] +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 [[TMP22]], i64 [[OMP_MAPTYPE]], ptr null) +// CHECK-NEXT: [[TMP33:%.*]] = add nuw i64 281474976711171, [[TMP24]] +// CHECK-NEXT: [[TMP34:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP35:%.*]] = icmp eq i64 [[TMP34]], 0 +// CHECK-NEXT: br i1 [[TMP35]], label [[OMP_TYPE_ALLOC1:%.*]], label [[OMP_TYPE_ALLOC_ELSE2:%.*]] +// CHECK: omp.type.alloc1: +// CHECK-NEXT: [[TMP36:%.*]] = and i64 [[TMP33]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END6:%.*]] +// CHECK: omp.type.alloc.else2: +// CHECK-NEXT: [[TMP37:%.*]] = icmp eq i64 [[TMP34]], 1 +// CHECK-NEXT: br i1 [[TMP37]], label [[OMP_TYPE_TO3:%.*]], label [[OMP_TYPE_TO_ELSE4:%.*]] +// CHECK: omp.type.to3: +// CHECK-NEXT: [[TMP38:%.*]] = and i64 [[TMP33]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END6]] +// CHECK: omp.type.to.else4: +// CHECK-NEXT: [[TMP39:%.*]] = icmp eq i64 [[TMP34]], 2 +// CHECK-NEXT: br i1 [[TMP39]], label [[OMP_TYPE_FROM5:%.*]], label [[OMP_TYPE_END6]] +// CHECK: omp.type.from5: +// CHECK-NEXT: [[TMP40:%.*]] = and i64 [[TMP33]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END6]] +// CHECK: omp.type.end6: +// CHECK-NEXT: [[OMP_MAPTYPE7:%.*]] = phi i64 [ [[TMP36]], [[OMP_TYPE_ALLOC1]] ], [ [[TMP38]], [[OMP_TYPE_TO3]] ], [ [[TMP40]], [[OMP_TYPE_FROM5]] ], [ [[TMP33]], [[OMP_TYPE_TO_ELSE4]] ] +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[E]], i64 4, i64 [[OMP_MAPTYPE7]], ptr null) +// CHECK-NEXT: [[TMP41:%.*]] = add 
nuw i64 281474976711171, [[TMP24]] +// CHECK-NEXT: [[TMP42:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP43:%.*]] = icmp eq i64 [[TMP42]], 0 +// CHECK-NEXT: br i1 [[TMP43]], label [[OMP_TYPE_ALLOC8:%.*]], label [[OMP_TYPE_ALLOC_ELSE9:%.*]] +// CHECK: omp.type.alloc8: +// CHECK-NEXT: [[TMP44:%.*]] = and i64 [[TMP41]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END13:%.*]] +// CHECK: omp.type.alloc.else9: +// CHECK-NEXT: [[TMP45:%.*]] = icmp eq i64 [[TMP42]], 1 +// CHECK-NEXT: br i1 [[TMP45]], label [[OMP_TYPE_TO10:%.*]], label [[OMP_TYPE_TO_ELSE11:%.*]] +// CHECK: omp.type.to10: +// CHECK-NEXT: [[TMP46:%.*]] = and i64 [[TMP41]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END13]] +// CHECK: omp.type.to.else11: +// CHECK-NEXT: [[TMP47:%.*]] = icmp eq i64 [[TMP42]], 2 +// CHECK-NEXT: br i1 [[TMP47]], label [[OMP_TYPE_FROM12:%.*]], label [[OMP_TYPE_END13]] +// CHECK: omp.type.from12: +// CHECK-NEXT: [[TMP48:%.*]] = and i64 [[TMP41]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END13]] +// CHECK: omp.type.end13: +// CHECK-NEXT: [[OMP_MAPTYPE14:%.*]] = phi i64 [ [[TMP44]], [[OMP_TYPE_ALLOC8]] ], [ [[TMP46]], [[OMP_TYPE_TO10]] ], [ [[TMP48]], [[OMP_TYPE_FROM12]] ], [ [[TMP41]], [[OMP_TYPE_TO_ELSE11]] ] +// CHECK-NEXT: call void @.omp_mapper._ZTS1C.default(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[F]], i64 4, i64 [[OMP_MAPTYPE14]], ptr null) #[[ATTR3]] +// CHECK-NEXT: [[TMP49:%.*]] = add nuw i64 281474976711171, [[TMP24]] +// CHECK-NEXT: [[TMP50:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP51:%.*]] = icmp eq i64 [[TMP50]], 0 +// CHECK-NEXT: br i1 [[TMP51]], label [[OMP_TYPE_ALLOC15:%.*]], label [[OMP_TYPE_ALLOC_ELSE16:%.*]] +// CHECK: omp.type.alloc15: +// CHECK-NEXT: [[TMP52:%.*]] = and i64 [[TMP49]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END20]] +// CHECK: omp.type.alloc.else16: +// CHECK-NEXT: [[TMP53:%.*]] = icmp eq i64 [[TMP50]], 1 +// CHECK-NEXT: br i1 [[TMP53]], label [[OMP_TYPE_TO17:%.*]], label [[OMP_TYPE_TO_ELSE18:%.*]] +// CHECK: omp.type.to17: +// 
CHECK-NEXT: [[TMP54:%.*]] = and i64 [[TMP49]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END20]] +// CHECK: omp.type.to.else18: +// CHECK-NEXT: [[TMP55:%.*]] = icmp eq i64 [[TMP50]], 2 +// CHECK-NEXT: br i1 [[TMP55]], label [[OMP_TYPE_FROM19:%.*]], label [[OMP_TYPE_END20]] +// CHECK: omp.type.from19: +// CHECK-NEXT: [[TMP56:%.*]] = and i64 [[TMP49]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END20]] +// CHECK: omp.type.end20: +// CHECK-NEXT: [[OMP_MAPTYPE21:%.*]] = phi i64 [ [[TMP52]], [[OMP_TYPE_ALLOC15]] ], [ [[TMP54]], [[OMP_TYPE_TO17]] ], [ [[TMP56]], [[OMP_TYPE_FROM19]] ], [ [[TMP49]], [[OMP_TYPE_TO_ELSE18]] ] +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[H]], i64 4, i64 [[OMP_MAPTYPE21]], ptr null) +// CHECK-NEXT: [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_D]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP7]] +// CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]] +// CHECK: omp.arraymap.exit: +// CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY22:%.*]] = icmp sgt i64 [[TMP6]], 1 +// CHECK-NEXT: [[TMP57:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP57]], 0 +// CHECK-NEXT: [[TMP58:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY22]], [[DOTOMP_ARRAY__DEL__DELETE]] +// CHECK-NEXT: br i1 [[TMP58]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]] +// CHECK: .omp.array..del: +// CHECK-NEXT: [[TMP59:%.*]] = mul nuw i64 [[TMP6]], 12 +// CHECK-NEXT: [[TMP60:%.*]] = and i64 [[TMP4]], -4 +// CHECK-NEXT: [[TMP61:%.*]] = or i64 [[TMP60]], 512 +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP59]], i64 [[TMP61]], ptr [[TMP5]]) +// CHECK-NEXT: br label [[OMP_DONE]] +// CHECK: omp.done: +// CHECK-NEXT: ret void +// +// +// CHECK-LABEL: define {{[^@]+}}@.omp_mapper._ZTS1C.default +// CHECK-SAME: (ptr 
noundef [[TMP0:%.*]], ptr noundef [[TMP1:%.*]], ptr noundef [[TMP2:%.*]], i64 noundef [[TMP3:%.*]], i64 noundef [[TMP4:%.*]], ptr noundef [[TMP5:%.*]]) #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP6:%.*]] = udiv exact i64 [[TMP3]], 4 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr [[STRUCT_C:%.*]], ptr [[TMP2]], i64 [[TMP6]] +// CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY:%.*]] = icmp sgt i64 [[TMP6]], 1 +// CHECK-NEXT: [[TMP8:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[TMP9:%.*]] = icmp ne ptr [[TMP1]], [[TMP2]] +// CHECK-NEXT: [[TMP10:%.*]] = and i64 [[TMP4]], 16 +// CHECK-NEXT: [[TMP11:%.*]] = icmp ne i64 [[TMP10]], 0 +// CHECK-NEXT: [[TMP12:%.*]] = and i1 [[TMP9]], [[TMP11]] +// CHECK-NEXT: [[TMP13:%.*]] = or i1 [[OMP_ARRAYINIT_ISARRAY]], [[TMP12]] +// CHECK-NEXT: [[DOTOMP_ARRAY__INIT__DELETE:%.*]] = icmp eq i64 [[TMP8]], 0 +// CHECK-NEXT: [[TMP14:%.*]] = and i1 [[TMP13]], [[DOTOMP_ARRAY__INIT__DELETE]] +// CHECK-NEXT: br i1 [[TMP14]], label [[DOTOMP_ARRAY__INIT:%.*]], label [[OMP_ARRAYMAP_HEAD:%.*]] +// CHECK: .omp.array..init: +// CHECK-NEXT: [[TMP15:%.*]] = mul nuw i64 [[TMP6]], 4 +// CHECK-NEXT: [[TMP16:%.*]] = and i64 [[TMP4]], -4 +// CHECK-NEXT: [[TMP17:%.*]] = or i64 [[TMP16]], 512 +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP15]], i64 [[TMP17]], ptr [[TMP5]]) +// CHECK-NEXT: br label [[OMP_ARRAYMAP_HEAD]] +// CHECK: omp.arraymap.head: +// CHECK-NEXT: [[OMP_ARRAYMAP_ISEMPTY:%.*]] = icmp eq ptr [[TMP2]], [[TMP7]] +// CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISEMPTY]], label [[OMP_DONE:%.*]], label [[OMP_ARRAYMAP_BODY:%.*]] +// CHECK: omp.arraymap.body: +// CHECK-NEXT: [[OMP_ARRAYMAP_PTRCURRENT:%.*]] = phi ptr [ [[TMP2]], [[OMP_ARRAYMAP_HEAD]] ], [ [[OMP_ARRAYMAP_NEXT:%.*]], [[OMP_TYPE_END:%.*]] ] +// CHECK-NEXT: [[A:%.*]] = getelementptr inbounds nuw [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 0, i32 0 +// CHECK-NEXT: [[TMP18:%.*]] = call i64 @__tgt_mapper_num_components(ptr [[TMP0]]) +// 
CHECK-NEXT: [[TMP19:%.*]] = shl i64 [[TMP18]], 48 +// CHECK-NEXT: [[TMP20:%.*]] = add nuw i64 1, [[TMP19]] +// CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP4]], 3 +// CHECK-NEXT: [[TMP22:%.*]] = icmp eq i64 [[TMP21]], 0 +// CHECK-NEXT: br i1 [[TMP22]], label [[OMP_TYPE_ALLOC:%.*]], label [[OMP_TYPE_ALLOC_ELSE:%.*]] +// CHECK: omp.type.alloc: +// CHECK-NEXT: [[TMP23:%.*]] = and i64 [[TMP20]], -4 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.alloc.else: +// CHECK-NEXT: [[TMP24:%.*]] = icmp eq i64 [[TMP21]], 1 +// CHECK-NEXT: br i1 [[TMP24]], label [[OMP_TYPE_TO:%.*]], label [[OMP_TYPE_TO_ELSE:%.*]] +// CHECK: omp.type.to: +// CHECK-NEXT: [[TMP25:%.*]] = and i64 [[TMP20]], -3 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.to.else: +// CHECK-NEXT: [[TMP26:%.*]] = icmp eq i64 [[TMP21]], 2 +// CHECK-NEXT: br i1 [[TMP26]], label [[OMP_TYPE_FROM:%.*]], label [[OMP_TYPE_END]] +// CHECK: omp.type.from: +// CHECK-NEXT: [[TMP27:%.*]] = and i64 [[TMP20]], -2 +// CHECK-NEXT: br label [[OMP_TYPE_END]] +// CHECK: omp.type.end: +// CHECK-NEXT: [[OMP_MAPTYPE:%.*]] = phi i64 [ [[TMP23]], [[OMP_TYPE_ALLOC]] ], [ [[TMP25]], [[OMP_TYPE_TO]] ], [ [[TMP27]], [[OMP_TYPE_FROM]] ], [ [[TMP20]], [[OMP_TYPE_TO_ELSE]] ] +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], ptr [[A]], i64 4, i64 [[OMP_MAPTYPE]], ptr null) +// CHECK-NEXT: [[OMP_ARRAYMAP_NEXT]] = getelementptr [[STRUCT_C]], ptr [[OMP_ARRAYMAP_PTRCURRENT]], i32 1 +// CHECK-NEXT: [[OMP_ARRAYMAP_ISDONE:%.*]] = icmp eq ptr [[OMP_ARRAYMAP_NEXT]], [[TMP7]] +// CHECK-NEXT: br i1 [[OMP_ARRAYMAP_ISDONE]], label [[OMP_ARRAYMAP_EXIT:%.*]], label [[OMP_ARRAYMAP_BODY]] +// CHECK: omp.arraymap.exit: +// CHECK-NEXT: [[OMP_ARRAYINIT_ISARRAY1:%.*]] = icmp sgt i64 [[TMP6]], 1 +// CHECK-NEXT: [[TMP28:%.*]] = and i64 [[TMP4]], 8 +// CHECK-NEXT: [[DOTOMP_ARRAY__DEL__DELETE:%.*]] = icmp ne i64 [[TMP28]], 0 +// CHECK-NEXT: [[TMP29:%.*]] = and i1 [[OMP_ARRAYINIT_ISARRAY1]], 
[[DOTOMP_ARRAY__DEL__DELETE]] +// CHECK-NEXT: br i1 [[TMP29]], label [[DOTOMP_ARRAY__DEL:%.*]], label [[OMP_DONE]] +// CHECK: .omp.array..del: +// CHECK-NEXT: [[TMP30:%.*]] = mul nuw i64 [[TMP6]], 4 +// CHECK-NEXT: [[TMP31:%.*]] = and i64 [[TMP4]], -4 +// CHECK-NEXT: [[TMP32:%.*]] = or i64 [[TMP31]], 512 +// CHECK-NEXT: call void @__tgt_push_mapper_component(ptr [[TMP0]], ptr [[TMP1]], ptr [[TMP2]], i64 [[TMP30]], i64 [[TMP32]], ptr [[TMP5]]) +// CHECK-NEXT: br label [[OMP_DONE]] +// CHECK: omp.done: +// CHECK-NEXT: ret void +// diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_ast_dump.cpp b/clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp similarity index 100% rename from clang/test/OpenMP/target_map_nest_defalut_mapper_ast_dump.cpp rename to clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_ast_dump.cpp diff --git a/clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp b/clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_codegen.cpp similarity index 100% rename from clang/test/OpenMP/target_map_nest_defalut_mapper_codegen.cpp rename to clang/test/OpenMP/target_map_array_section_of_structs_with_nested_mapper_codegen.cpp diff --git a/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp b/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp index d545e98ef6c3e..93695d1b388ff 100644 --- a/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp +++ b/offload/test/mapping/declare_mapper_nested_default_mappers_array.cpp @@ -4,8 +4,6 @@ // RUN: %libomptarget-compilexx-run-and-check-x86_64-unknown-linux-gnu // RUN: %libomptarget-compilexx-run-and-check-nvptx64-nvidia-cuda -// UNSUPPORTED: clang - #include #include @@ -50,7 +48,7 @@ int main() { sa[1].h = N; printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1], - sa[1].f.b == &x[0] ? 1 : 0); + sa[1].f.b == &y[0] ? 
1 : 0); // CHECK: 111 222 777 20.00000 1 __intptr_t p = reinterpret_cast<__intptr_t>(&y[0]); @@ -65,6 +63,6 @@ int main() { sa[1].f.b[1] = 40; } printf("%d %d %d %4.5f %d\n", sa[1].e, sa[1].f.a, sa[1].f.c.a, sa[1].f.b[1], - sa[1].f.b == &x[0] ? 1 : 0); + sa[1].f.b == &y[0] ? 1 : 0); // CHECK: 333 222 777 40.00000 1 } From 574f77a1ee34461bc1f4a0823da6c960ff1c9655 Mon Sep 17 00:00:00 2001 From: Erich Keane Date: Wed, 11 Jun 2025 12:04:26 -0700 Subject: [PATCH 127/851] [OpenACC][CIR] Add parallelism determ. to all acc.loops (#143751) PR #143720 adds a requirement to the ACC dialect that every acc.loop must have a seq, independent, or auto attribute for the 'default' device_type. The standard has rules for how this can be intuited: orphan/parallel/parallel loop: independent kernels/kernels loop: auto serial/serial loop: seq, unless there is a gang/worker/vector, at which point it should be 'auto'. This patch implements all of these rules as a 'cleanup' step on the IR generation for combined/loop operations. Note that the test impact is much less since I inadvertently have my 'operation' terminating curly matching the end curly from 'attribute' instead of the front of the line, so I've added sufficient tests to ensure I captured the above.
--- clang/lib/CIR/CodeGen/CIRGenFunction.h | 12 +++ clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp | 2 + .../lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp | 33 ++++++ clang/test/CIR/CodeGenOpenACC/combined.cpp | 69 ++++++++++-- clang/test/CIR/CodeGenOpenACC/loop.cpp | 101 ++++++++++++++++-- .../mlir/Dialect/OpenACC/OpenACCOps.td | 8 ++ mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 24 +++++ 7 files changed, 232 insertions(+), 17 deletions(-) diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index b08dd540e6289..682d59d63faa8 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -34,6 +34,12 @@ namespace { class ScalarExprEmitter; } // namespace +namespace mlir { +namespace acc { +class LoopOp; +} // namespace acc +} // namespace mlir + namespace clang::CIRGen { class CIRGenFunction : public CIRGenTypeCache { @@ -1082,6 +1088,12 @@ class CIRGenFunction : public CIRGenTypeCache { OpenACCDirectiveKind dirKind, SourceLocation dirLoc, ArrayRef clauses); + // The OpenACC LoopOp requires that we have auto, seq, or independent on all + // LoopOp operations for the 'none' device type case. This function checks if + // the LoopOp has one, else it updates it to have one. 
+ void updateLoopOpParallelism(mlir::acc::LoopOp &op, bool isOrphan, + OpenACCDirectiveKind dk); + public: mlir::LogicalResult emitOpenACCComputeConstruct(const OpenACCComputeConstruct &s); diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index 2aab9cecf93d8..1feefa55eb270 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -102,6 +102,8 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses); + updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind); + builder.create(end); } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp index 24cd1d399de65..71f3ccb8e040e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp @@ -22,6 +22,36 @@ using namespace clang::CIRGen; using namespace cir; using namespace mlir::acc; +void CIRGenFunction::updateLoopOpParallelism(mlir::acc::LoopOp &op, + bool isOrphan, + OpenACCDirectiveKind dk) { + // Check that at least one of auto, independent, or seq is present + // for the device-independent default clauses. 
+ if (op.hasParallelismFlag(mlir::acc::DeviceType::None)) + return; + + switch (dk) { + default: + llvm_unreachable("Invalid parent directive kind"); + case OpenACCDirectiveKind::Invalid: + case OpenACCDirectiveKind::Parallel: + case OpenACCDirectiveKind::ParallelLoop: + op.addIndependent(builder.getContext(), {}); + return; + case OpenACCDirectiveKind::Kernels: + case OpenACCDirectiveKind::KernelsLoop: + op.addAuto(builder.getContext(), {}); + return; + case OpenACCDirectiveKind::Serial: + case OpenACCDirectiveKind::SerialLoop: + if (op.hasDefaultGangWorkerVector()) + op.addAuto(builder.getContext(), {}); + else + op.addSeq(builder.getContext(), {}); + return; + }; +} + mlir::LogicalResult CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { mlir::Location start = getLoc(s.getSourceRange().getBegin()); @@ -90,6 +120,9 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), s.clauses()); + updateLoopOpParallelism(op, s.isOrphanedLoopConstruct(), + s.getParentComputeConstructKind()); + mlir::LogicalResult stmtRes = mlir::success(); // Emit body. 
{ diff --git a/clang/test/CIR/CodeGenOpenACC/combined.cpp b/clang/test/CIR/CodeGenOpenACC/combined.cpp index 1f3c9f1a8d3fa..5b83a9cb91898 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined.cpp @@ -74,7 +74,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {seq = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {seq = [#acc.device_type, #acc.device_type, #acc.device_type]} loc // CHECK: acc.yield // CHECK-NEXT: } loc #pragma acc kernels loop seq device_type(nvidia, radeon) @@ -99,7 +99,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {auto_ = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type, #acc.device_type], seq = [#acc.device_type]} loc // CHECK: acc.yield // CHECK-NEXT: } loc #pragma acc kernels loop auto device_type(nvidia, radeon) @@ -124,7 +124,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type], seq = [#acc.device_type]} loc // CHECK: acc.yield // CHECK-NEXT: } loc #pragma acc kernels loop independent device_type(nvidia, radeon) @@ -143,7 +143,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.parallel combined(loop) { // CHECK: acc.loop combined(parallel) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} // CHECK: acc.yield // 
CHECK-NEXT: } loc @@ -154,7 +154,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.serial combined(loop) { // CHECK: acc.loop combined(serial) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type, #acc.device_type], seq = [#acc.device_type]} // CHECK: acc.yield // CHECK-NEXT: } loc @@ -165,7 +165,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.kernels combined(loop) { // CHECK: acc.loop combined(kernels) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type], collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type]} // CHECK: acc.terminator // CHECK-NEXT: } loc #pragma acc parallel loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3) @@ -175,7 +175,7 @@ extern "C" void acc_combined(int N, int cond) { // CHECK: acc.parallel combined(loop) { // CHECK: acc.loop combined(parallel) { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type, #acc.device_type], independent = [#acc.device_type]} // CHECK: acc.yield // CHECK-NEXT: } loc @@ -1184,4 +1184,59 @@ extern "C" void acc_combined_data_clauses(int *arg1, int *arg2) { // CHECK-NEXT: } loc // CHECK-NEXT: acc.detach accPtr(%[[ATTACH2]] : !cir.ptr>) async([#acc.device_type]) {dataClause = #acc, name = "arg2"} // CHECK-NEXT: acc.detach accPtr(%[[ATTACH1]] : !cir.ptr>) async([#acc.device_type]) {dataClause = #acc, name = "arg1"} + + // Checking the 
automatic-addition of parallelism clauses. +#pragma acc parallel loop + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.parallel combined(loop) { + // CHECK-NEXT: acc.loop combined(parallel) { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {independent = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc kernels loop + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.kernels combined(loop) { + // CHECK-NEXT: acc.loop combined(kernels) { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc serial loop + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {seq = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop worker + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) worker { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop vector + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) vector { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial loop gang + for(unsigned I = 0; I < 5; ++I); + // CHECK-NEXT: acc.serial combined(loop) { + // CHECK-NEXT: acc.loop combined(serial) gang { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc } diff --git a/clang/test/CIR/CodeGenOpenACC/loop.cpp b/clang/test/CIR/CodeGenOpenACC/loop.cpp index db94e2819b301..c0bf11e353951 100644 --- 
a/clang/test/CIR/CodeGenOpenACC/loop.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop.cpp @@ -41,12 +41,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {seq = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type], seq = [#acc.device_type, #acc.device_type]} loc #pragma acc loop device_type(radeon) seq for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {seq = [#acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type], seq = [#acc.device_type]} loc #pragma acc loop seq device_type(nvidia, radeon) for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { @@ -67,12 +67,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type, #acc.device_type]} loc #pragma acc loop device_type(radeon) independent for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {independent = [#acc.device_type]} loc + // CHECK-NEXT: } attributes {independent = [#acc.device_type, #acc.device_type]} loc #pragma acc loop independent device_type(nvidia, radeon) for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { @@ -93,12 +93,12 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {auto_ = [#acc.device_type, #acc.device_type]} loc + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type, #acc.device_type], independent = [#acc.device_type]} loc #pragma acc loop device_type(radeon) auto for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { // CHECK: 
acc.yield - // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type], independent = [#acc.device_type]} loc #pragma acc loop auto device_type(nvidia, radeon) for(unsigned I = 0; I < N; ++I); // CHECK: acc.loop { @@ -116,7 +116,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} #pragma acc loop collapse(1) device_type(radeon) collapse (2) for(unsigned I = 0; I < N; ++I) @@ -124,7 +124,7 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2], collapseDeviceType = [#acc.device_type, #acc.device_type], independent = [#acc.device_type]} #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse (2) for(unsigned I = 0; I < N; ++I) @@ -132,14 +132,14 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2, 2], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type], independent = [#acc.device_type]} #pragma acc loop collapse(1) device_type(radeon, nvidia) collapse(2) device_type(host) collapse(3) for(unsigned I = 0; I < N; ++I) for(unsigned J = 0; J < N; ++J) for(unsigned K = 0; K < N; ++K); // CHECK: acc.loop { // CHECK: acc.yield - // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = 
[#acc.device_type, #acc.device_type, #acc.device_type, #acc.device_type]} + // CHECK-NEXT: } attributes {collapse = [1, 2, 2, 3], collapseDeviceType = [#acc.device_type, #acc.device_type, #acc.device_type, #acc.device_type], independent = [#acc.device_type]} #pragma acc loop tile(1, 2, 3) for(unsigned I = 0; I < N; ++I) @@ -392,4 +392,85 @@ extern "C" void acc_loop(int *A, int *B, int *C, int N) { // CHECK: acc.yield // CHECK-NEXT: } loc } + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + + // Checking the automatic-addition of parallelism clauses. +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {independent = [#acc.device_type]} loc + +#pragma acc parallel + { + // CHECK-NEXT: acc.parallel { +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {independent = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc kernels + { + // CHECK-NEXT: acc.kernels { +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.terminator + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {seq = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop worker + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop worker { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop vector + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: 
acc.loop vector { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc + +#pragma acc serial + { + // CHECK-NEXT: acc.serial { +#pragma acc loop gang + for(unsigned I = 0; I < N; ++I); + // CHECK-NEXT: acc.loop gang { + // CHECK: acc.yield + // CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} loc + } + // CHECK-NEXT: acc.yield + // CHECK-NEXT: } loc } diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 083a18d80704e..34312655115a1 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -2246,6 +2246,14 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", // device_types. This is for the case where there is no expression specified // in a 'gang'. void addEmptyGang(MLIRContext *, llvm::ArrayRef); + + // Return whether this LoopOp has an auto, seq, or independent for the + // specified device-type. + bool hasParallelismFlag(DeviceType); + + // Return whether this LoopOp has a gang, worker, or vector applying to the + // 'default'/None device-type. 
+ bool hasDefaultGangWorkerVector(); }]; let hasCustomAssemblyFormat = 1; diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index c72ec47be9f04..21e6b9d85f1a1 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -2839,6 +2839,30 @@ void acc::LoopOp::addEmptyGang( effectiveDeviceTypes)); } +bool acc::LoopOp::hasParallelismFlag(DeviceType dt) { + auto hasDevice = [=](DeviceTypeAttr attr) -> bool { + return attr.getValue() == dt; + }; + auto testFromArr = [=](ArrayAttr arr) -> bool { + return llvm::any_of(arr.getAsRange(), hasDevice); + }; + + if (ArrayAttr arr = getSeqAttr(); arr && testFromArr(arr)) + return true; + if (ArrayAttr arr = getIndependentAttr(); arr && testFromArr(arr)) + return true; + if (ArrayAttr arr = getAuto_Attr(); arr && testFromArr(arr)) + return true; + + return false; +} + +bool acc::LoopOp::hasDefaultGangWorkerVector() { + return hasVector() || getVectorValue() || hasWorker() || getWorkerValue() || + hasGang() || getGangValue(GangArgType::Num) || + getGangValue(GangArgType::Dim) || getGangValue(GangArgType::Static); +} + void acc::LoopOp::addGangOperands( MLIRContext *context, llvm::ArrayRef effectiveDeviceTypes, llvm::ArrayRef argTypes, mlir::ValueRange values) { From d5f68cb145059fc6d2944e1d17ef561e183ade83 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 11 Jun 2025 12:09:44 -0700 Subject: [PATCH 128/851] [bazel] Port fe7bf4b90b1a835418bddd2b2aa63b4977a9f6d2 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index f6a7cd7dea85b..7bcb1d4ca883c 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -6881,8 +6881,8 @@ cc_library( deps = [ ":SPIRVDialect", ":Support", - "//llvm:config", 
"//llvm:Support", + "//llvm:config", ], ) @@ -11249,7 +11249,7 @@ td_library( ) gentbl_cc_library( - name = "TransformDialectEnumsIncGen", + name = "TransformAttrsIncGen", tbl_outs = { "include/mlir/Dialect/Transform/IR/TransformDialectEnums.h.inc": [ "-gen-enum-decls", @@ -11257,6 +11257,12 @@ gentbl_cc_library( "include/mlir/Dialect/Transform/IR/TransformDialectEnums.cpp.inc": [ "-gen-enum-defs", ], + "include/mlir/Dialect/Transform/IR/TransformAttrs.h.inc": [ + "-gen-attrdef-decls", + ], + "include/mlir/Dialect/Transform/IR/TransformAttrs.cpp.inc": [ + "-gen-attrdef-defs", + ], }, tblgen = ":mlir-tblgen", td_file = "include/mlir/Dialect/Transform/IR/TransformAttrs.td", @@ -11382,7 +11388,7 @@ cc_library( ":Rewrite", ":SideEffectInterfaces", ":Support", - ":TransformDialectEnumsIncGen", + ":TransformAttrsIncGen", ":TransformDialectIncGen", ":TransformDialectInterfaces", ":TransformDialectUtils", From 5dafe9dca867b90f20dcd71c620ad823aee4262b Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Wed, 11 Jun 2025 12:23:17 -0700 Subject: [PATCH 129/851] [libc] Reduce direct use of errno in src/stdlib and src/__support tests. (#143767) * Get rid of libc_errno assignments in str_to_* __support tests, since those API have been migrated to return error in a struct instead. * Migrate tests for atof and to strto* functions from and for strdup from to use ErrnoCheckingTest harness. 
--- libc/test/src/__support/CMakeLists.txt | 2 - .../test/src/__support/str_to_double_test.cpp | 1 - libc/test/src/__support/str_to_float_test.cpp | 1 - libc/test/src/__support/str_to_fp_test.h | 2 - .../src/__support/str_to_integer_test.cpp | 1 - libc/test/src/stdlib/CMakeLists.txt | 5 ++ libc/test/src/stdlib/StrtolTest.h | 60 +------------------ libc/test/src/stdlib/atof_test.cpp | 9 ++- libc/test/src/stdlib/strtod_test.cpp | 5 +- libc/test/src/stdlib/strtof_test.cpp | 5 +- libc/test/src/stdlib/strtold_test.cpp | 5 +- libc/test/src/string/CMakeLists.txt | 1 + libc/test/src/string/strdup_test.cpp | 13 ++-- 13 files changed, 24 insertions(+), 86 deletions(-) diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt index c1736c8fe59e2..4fb0dae86e5ca 100644 --- a/libc/test/src/__support/CMakeLists.txt +++ b/libc/test/src/__support/CMakeLists.txt @@ -127,7 +127,6 @@ add_libc_test( libc.src.__support.integer_literals libc.src.__support.str_to_float libc.src.__support.uint128 - libc.src.errno.errno ) @@ -140,7 +139,6 @@ add_libc_test( DEPENDS libc.src.__support.integer_literals libc.src.__support.str_to_integer - libc.src.errno.errno ) add_libc_test( diff --git a/libc/test/src/__support/str_to_double_test.cpp b/libc/test/src/__support/str_to_double_test.cpp index ccfa44f12d8ef..dc503aa16f08c 100644 --- a/libc/test/src/__support/str_to_double_test.cpp +++ b/libc/test/src/__support/str_to_double_test.cpp @@ -99,7 +99,6 @@ TEST(LlvmLibcStrToDblTest, SimpleDecimalConversionExtraTypes) { uint64_t double_output_mantissa = 0; uint32_t output_exp2 = 0; - LIBC_NAMESPACE::libc_errno = 0; auto double_result = internal::simple_decimal_conversion("123456789012345678900"); diff --git a/libc/test/src/__support/str_to_float_test.cpp b/libc/test/src/__support/str_to_float_test.cpp index 66f7db742eb45..03ae80fc2ee38 100644 --- a/libc/test/src/__support/str_to_float_test.cpp +++ b/libc/test/src/__support/str_to_float_test.cpp @@ -55,7 +55,6 @@ 
TEST(LlvmLibcStrToFltTest, SimpleDecimalConversionExtraTypes) { uint32_t float_output_mantissa = 0; uint32_t output_exp2 = 0; - LIBC_NAMESPACE::libc_errno = 0; auto float_result = internal::simple_decimal_conversion("123456789012345678900"); float_output_mantissa = float_result.num.mantissa; diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h index c7bc57b845fe0..d349192f107c0 100644 --- a/libc/test/src/__support/str_to_fp_test.h +++ b/libc/test/src/__support/str_to_fp_test.h @@ -10,7 +10,6 @@ #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" #include "src/__support/uint128.h" -#include "src/errno/libc_errno.h" #include "test/UnitTest/Test.h" @@ -67,7 +66,6 @@ template struct LlvmLibcStrToFloatTest : public testing::Test { const int expectedErrno = 0) { StorageType actual_output_mantissa = 0; uint32_t actual_output_exp2 = 0; - LIBC_NAMESPACE::libc_errno = 0; auto result = internal::simple_decimal_conversion(numStart); diff --git a/libc/test/src/__support/str_to_integer_test.cpp b/libc/test/src/__support/str_to_integer_test.cpp index 34b645b4b38c8..1ec882b212b8a 100644 --- a/libc/test/src/__support/str_to_integer_test.cpp +++ b/libc/test/src/__support/str_to_integer_test.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" #include #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/CMakeLists.txt b/libc/test/src/stdlib/CMakeLists.txt index 302971a078c17..45fd49b6d3526 100644 --- a/libc/test/src/stdlib/CMakeLists.txt +++ b/libc/test/src/stdlib/CMakeLists.txt @@ -9,6 +9,7 @@ add_libc_test( DEPENDS libc.src.errno.errno libc.src.stdlib.atof + libc.test.UnitTest.ErrnoCheckingTest ) add_header_library( @@ -64,6 +65,7 @@ add_fp_unittest( libc.src.errno.errno libc.src.stdlib.strtod libc.src.__support.FPUtil.fenv_impl + libc.test.UnitTest.ErrnoCheckingTest ) 
add_fp_unittest( @@ -76,6 +78,7 @@ add_fp_unittest( libc.src.errno.errno libc.src.stdlib.strtof libc.src.__support.FPUtil.fenv_impl + libc.test.UnitTest.ErrnoCheckingTest ) add_header_library( @@ -86,6 +89,7 @@ add_header_library( libc.src.__support.CPP.limits libc.src.__support.CPP.type_traits libc.src.errno.errno + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -133,6 +137,7 @@ add_libc_test( libc.src.errno.errno libc.src.__support.uint128 libc.src.stdlib.strtold + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index ed302f14d03ef..03f0a6539c785 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -10,7 +10,7 @@ #include "src/__support/CPP/type_traits.h" #include "src/__support/ctype_utils.h" #include "src/__support/macros/properties/architectures.h" -#include "src/errno/libc_errno.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include @@ -18,7 +18,7 @@ using LIBC_NAMESPACE::cpp::is_signed_v; template -struct StrtoTest : public LIBC_NAMESPACE::testing::Test { +struct StrtoTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { using FunctionT = ReturnT (*)(const char *, char **, int); static constexpr ReturnT T_MAX = @@ -28,7 +28,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { void InvalidBase(FunctionT func) { const char *ten = "10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(ten, nullptr, -1), ReturnT(0)); ASSERT_ERRNO_EQ(EINVAL); } @@ -38,23 +37,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // TODO: Look into collapsing these repeated segments. 
const char *ten = "10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(ten, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - ten, ptrdiff_t(2)); - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(ten, nullptr, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); const char *hundred = "100"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(hundred, &str_end, 10), ReturnT(100)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - hundred, ptrdiff_t(3)); const char *big_number = "1234567890"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(big_number, &str_end, 10), ReturnT(1234567890)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - big_number, ptrdiff_t(10)); @@ -62,7 +57,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // This number is larger than 2^32, meaning that if long is only 32 bits // wide, strtol will return LONG_MAX. const char *bigger_number = "12345678900"; - LIBC_NAMESPACE::libc_errno = 0; if constexpr (sizeof(ReturnT) < 8) { ASSERT_EQ(func(bigger_number, &str_end, 10), T_MAX); ASSERT_ERRNO_EQ(ERANGE); @@ -73,14 +67,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { EXPECT_EQ(str_end - bigger_number, ptrdiff_t(11)); const char *too_big_number = "123456789012345678901"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(too_big_number, &str_end, 10), T_MAX); ASSERT_ERRNO_EQ(ERANGE); EXPECT_EQ(str_end - too_big_number, ptrdiff_t(21)); const char *long_number_range_test = "10000000000000000000000000000000000000000000000000"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(long_number_range_test, &str_end, 10), T_MAX); ASSERT_ERRNO_EQ(ERANGE); EXPECT_EQ(str_end - long_number_range_test, ptrdiff_t(50)); @@ -88,19 +80,16 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // For most negative numbers, the unsigned functions treat it the same as // casting a negative variable to an unsigned type. 
const char *negative = "-100"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(negative, &str_end, 10), ReturnT(-100)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - negative, ptrdiff_t(4)); const char *big_negative_number = "-1234567890"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(big_negative_number, &str_end, 10), ReturnT(-1234567890)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - big_negative_number, ptrdiff_t(11)); const char *too_big_negative_number = "-123456789012345678901"; - LIBC_NAMESPACE::libc_errno = 0; // If the number is signed, it should return the smallest negative number // for the current type, but if it's unsigned it should max out and return // the largest positive number for the current type. From the standard: @@ -118,73 +107,61 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { char *str_end = nullptr; const char *spaces_before = " 10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(spaces_before, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - spaces_before, ptrdiff_t(7)); const char *spaces_after = "10 "; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(spaces_after, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - spaces_after, ptrdiff_t(2)); const char *word_before = "word10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(word_before, &str_end, 10), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - word_before, ptrdiff_t(0)); const char *word_after = "10word"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(word_after, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - word_after, ptrdiff_t(2)); const char *two_numbers = "10 999"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(two_numbers, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - two_numbers, ptrdiff_t(2)); const char *two_signs = "--10 999"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(two_signs, &str_end, 10), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); 
EXPECT_EQ(str_end - two_signs, ptrdiff_t(0)); const char *sign_before = "+2=4"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(sign_before, &str_end, 10), ReturnT(2)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - sign_before, ptrdiff_t(2)); const char *sign_after = "2+2=4"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(sign_after, &str_end, 10), ReturnT(2)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - sign_after, ptrdiff_t(1)); const char *tab_before = "\t10"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(tab_before, &str_end, 10), ReturnT(10)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - tab_before, ptrdiff_t(3)); const char *all_together = "\t -12345and+67890"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(all_together, &str_end, 10), ReturnT(-12345)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - all_together, ptrdiff_t(9)); const char *just_spaces = " "; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_spaces, &str_end, 10), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_spaces, ptrdiff_t(0)); const char *just_space_and_sign = " +"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_space_and_sign, &str_end, 10), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_space_and_sign, ptrdiff_t(0)); @@ -203,12 +180,10 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { small_string[0] = static_cast( LIBC_NAMESPACE::internal::int_to_b36_char(first_digit)); if (first_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(first_digit)); ASSERT_ERRNO_SUCCESS(); } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); } @@ -223,18 +198,15 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { small_string[1] = static_cast( LIBC_NAMESPACE::internal::int_to_b36_char(second_digit)); if (first_digit < base && second_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ( func(small_string, 
nullptr, base), static_cast(second_digit + (first_digit * base))); ASSERT_ERRNO_SUCCESS(); } else if (first_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(first_digit)); ASSERT_ERRNO_SUCCESS(); } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); } @@ -255,14 +227,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { if (first_digit < base && second_digit < base && third_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(third_digit + (second_digit * base) + (first_digit * base * base))); ASSERT_ERRNO_SUCCESS(); } else if (first_digit < base && second_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ( func(small_string, nullptr, base), static_cast(second_digit + (first_digit * base))); @@ -272,23 +242,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // The number is treated as a one digit hexadecimal. 
if (base == 16 && first_digit == 0 && second_digit == 33) { if (third_digit < base) { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(third_digit)); ASSERT_ERRNO_SUCCESS(); } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); } } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), static_cast(first_digit)); ASSERT_ERRNO_SUCCESS(); } } else { - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(small_string, nullptr, base), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); } @@ -302,19 +268,16 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { char *str_end = nullptr; const char *no_prefix = "123abc"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(no_prefix, &str_end, 16), ReturnT(0x123abc)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - no_prefix, ptrdiff_t(6)); const char *yes_prefix = "0x456def"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(yes_prefix, &str_end, 16), ReturnT(0x456def)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - yes_prefix, ptrdiff_t(8)); const char *letter_after_prefix = "0xabc123"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(letter_after_prefix, &str_end, 16), ReturnT(0xabc123)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - letter_after_prefix, ptrdiff_t(8)); @@ -325,7 +288,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // Max size for unsigned 32 bit numbers const char *max_32_bit_value = "0xFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(max_32_bit_value, &str_end, 0), ((is_signed_v && sizeof(ReturnT) == 4) ? T_MAX @@ -334,7 +296,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { EXPECT_EQ(str_end - max_32_bit_value, ptrdiff_t(10)); const char *negative_max_32_bit_value = "-0xFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(negative_max_32_bit_value, &str_end, 0), ((is_signed_v && sizeof(ReturnT) == 4) ? 
T_MIN @@ -345,13 +306,11 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // Max size for signed 32 bit numbers const char *max_31_bit_value = "0x7FFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(max_31_bit_value, &str_end, 0), ReturnT(0x7FFFFFFF)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - max_31_bit_value, ptrdiff_t(10)); const char *negative_max_31_bit_value = "-0x7FFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(negative_max_31_bit_value, &str_end, 0), -ReturnT(0x7FFFFFFF)); ASSERT_ERRNO_SUCCESS(); @@ -360,7 +319,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // Max size for unsigned 64 bit numbers const char *max_64_bit_value = "0xFFFFFFFFFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(max_64_bit_value, &str_end, 0), (is_signed_v || sizeof(ReturnT) < 8 ? T_MAX @@ -371,7 +329,6 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // See the end of CleanBase10Decode for an explanation of how this large // negative number can end up as T_MAX. const char *negative_max_64_bit_value = "-0xFFFFFFFFFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ( func(negative_max_64_bit_value, &str_end, 0), (is_signed_v @@ -383,14 +340,12 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { // Max size for signed 64 bit numbers const char *max_63_bit_value = "0x7FFFFFFFFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(max_63_bit_value, &str_end, 0), (sizeof(ReturnT) < 8 ? T_MAX : ReturnT(0x7FFFFFFFFFFFFFFF))); ASSERT_ERRNO_EQ(sizeof(ReturnT) < 8 ? ERANGE : 0); EXPECT_EQ(str_end - max_63_bit_value, ptrdiff_t(18)); const char *negative_max_63_bit_value = "-0x7FFFFFFFFFFFFFFF"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(negative_max_63_bit_value, &str_end, 0), (sizeof(ReturnT) >= 8 ? -ReturnT(0x7FFFFFFFFFFFFFFF) : (is_signed_v ? 
T_MIN : T_MAX))); @@ -402,23 +357,19 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { char *str_end = nullptr; const char *just_prefix = "0x"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_prefix, &str_end, 16), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_prefix, ptrdiff_t(1)); - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_prefix, &str_end, 0), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_prefix, ptrdiff_t(1)); const char *prefix_with_x_after = "0xx"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(prefix_with_x_after, &str_end, 16), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - prefix_with_x_after, ptrdiff_t(1)); - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(prefix_with_x_after, &str_end, 0), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - prefix_with_x_after, ptrdiff_t(1)); @@ -428,43 +379,36 @@ struct StrtoTest : public LIBC_NAMESPACE::testing::Test { char *str_end = nullptr; const char *base_ten = "12345"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(base_ten, &str_end, 0), ReturnT(12345)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - base_ten, ptrdiff_t(5)); const char *base_sixteen_no_prefix = "123abc"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(base_sixteen_no_prefix, &str_end, 0), ReturnT(123)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - base_sixteen_no_prefix, ptrdiff_t(3)); const char *base_sixteen_with_prefix = "0x456def"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(base_sixteen_with_prefix, &str_end, 0), ReturnT(0x456def)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - base_sixteen_with_prefix, ptrdiff_t(8)); const char *base_eight_with_prefix = "012345"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(base_eight_with_prefix, &str_end, 0), ReturnT(012345)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - base_eight_with_prefix, ptrdiff_t(6)); const char *just_zero = "0"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_zero, &str_end, 0), ReturnT(0)); 
ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_zero, ptrdiff_t(1)); const char *just_zero_x = "0x"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_zero_x, &str_end, 0), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_zero_x, ptrdiff_t(1)); const char *just_zero_eight = "08"; - LIBC_NAMESPACE::libc_errno = 0; ASSERT_EQ(func(just_zero_eight, &str_end, 0), ReturnT(0)); ASSERT_ERRNO_SUCCESS(); EXPECT_EQ(str_end - just_zero_eight, ptrdiff_t(1)); diff --git a/libc/test/src/stdlib/atof_test.cpp b/libc/test/src/stdlib/atof_test.cpp index 1e4259b792d7e..92b904ecad94e 100644 --- a/libc/test/src/stdlib/atof_test.cpp +++ b/libc/test/src/stdlib/atof_test.cpp @@ -7,29 +7,28 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" #include "src/stdlib/atof.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include +using LlvmLibcAToFTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; // This is just a simple test to make sure that this function works at all. It's // functionally identical to strtod so the bulk of the testing is there. -TEST(LlvmLibcAToFTest, SimpleTest) { +TEST_F(LlvmLibcAToFTest, SimpleTest) { LIBC_NAMESPACE::fputil::FPBits expected_fp = LIBC_NAMESPACE::fputil::FPBits(uint64_t(0x405ec00000000000)); - LIBC_NAMESPACE::libc_errno = 0; EXPECT_THAT(LIBC_NAMESPACE::atof("123"), Succeeds(expected_fp.get_val())); } -TEST(LlvmLibcAToFTest, FailedParsingTest) { - LIBC_NAMESPACE::libc_errno = 0; +TEST_F(LlvmLibcAToFTest, FailedParsingTest) { // atof does not flag errors. 
EXPECT_THAT(LIBC_NAMESPACE::atof("???"), Succeeds(0.0)); } diff --git a/libc/test/src/stdlib/strtod_test.cpp b/libc/test/src/stdlib/strtod_test.cpp index 92d14640e6533..db3c1d73bd22e 100644 --- a/libc/test/src/stdlib/strtod_test.cpp +++ b/libc/test/src/stdlib/strtod_test.cpp @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" #include "src/stdlib/strtod.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/RoundingModeUtils.h" #include "test/UnitTest/Test.h" @@ -22,7 +22,7 @@ using LIBC_NAMESPACE::fputil::testing::RoundingMode; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; -class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::Test, +class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest, ForceRoundingModeTest { public: void run_test(const char *inputString, const ptrdiff_t expectedStrLen, @@ -46,7 +46,6 @@ class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::Test, LIBC_NAMESPACE::fputil::FPBits expected_fp = LIBC_NAMESPACE::fputil::FPBits(expectedRawData); - LIBC_NAMESPACE::libc_errno = 0; double result = LIBC_NAMESPACE::strtod(inputString, &str_end); if (expectedErrno == 0) EXPECT_THAT(result, Succeeds(expected_fp.get_val())); diff --git a/libc/test/src/stdlib/strtof_test.cpp b/libc/test/src/stdlib/strtof_test.cpp index 6a716c956291c..6df1ddda93bfa 100644 --- a/libc/test/src/stdlib/strtof_test.cpp +++ b/libc/test/src/stdlib/strtof_test.cpp @@ -7,9 +7,9 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" #include "src/stdlib/strtof.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/RoundingModeUtils.h" #include 
"test/UnitTest/Test.h" @@ -19,7 +19,7 @@ using LIBC_NAMESPACE::fputil::testing::ForceRoundingModeTest; using LIBC_NAMESPACE::fputil::testing::RoundingMode; -class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::Test, +class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest, ForceRoundingModeTest { public: void run_test(const char *inputString, const ptrdiff_t expectedStrLen, @@ -43,7 +43,6 @@ class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::Test, LIBC_NAMESPACE::fputil::FPBits expected_fp = LIBC_NAMESPACE::fputil::FPBits(expectedRawData); - LIBC_NAMESPACE::libc_errno = 0; float result = LIBC_NAMESPACE::strtof(inputString, &str_end); EXPECT_EQ(str_end - inputString, expectedStrLen); diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index b209c85b88e36..eb4056dc7ba64 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -8,9 +8,9 @@ #include "src/__support/FPUtil/FPBits.h" #include "src/__support/uint128.h" -#include "src/errno/libc_errno.h" #include "src/stdlib/strtold.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include @@ -25,7 +25,7 @@ #error "Unknown long double type" #endif -class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::Test { +class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { public: #if defined(LIBC_TYPES_LONG_DOUBLE_IS_FLOAT64) void run_test(const char *inputString, const ptrdiff_t expectedStrLen, @@ -80,7 +80,6 @@ class LlvmLibcStrToLDTest : public LIBC_NAMESPACE::testing::Test { FPBits(static_cast(expectedRawData)); const int expected_errno = expectedErrno; - LIBC_NAMESPACE::libc_errno = 0; long double result = LIBC_NAMESPACE::strtold(inputString, &str_end); LIBC_NAMESPACE::fputil::FPBits actual_fp = diff --git a/libc/test/src/string/CMakeLists.txt b/libc/test/src/string/CMakeLists.txt index a675373938e99..ced60750a45c7 100644 --- 
a/libc/test/src/string/CMakeLists.txt +++ b/libc/test/src/string/CMakeLists.txt @@ -168,6 +168,7 @@ add_libc_test( DEPENDS libc.src.string.strdup libc.src.errno.errno + libc.test.UnitTest.ErrnoCheckingTest ) # FIXME: This is failing on the bot for some reason, disable for now. diff --git a/libc/test/src/string/strdup_test.cpp b/libc/test/src/string/strdup_test.cpp index 20b85c37637dd..4b18fc7f1bdee 100644 --- a/libc/test/src/string/strdup_test.cpp +++ b/libc/test/src/string/strdup_test.cpp @@ -6,14 +6,15 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" #include "src/string/strdup.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -TEST(LlvmLibcStrDupTest, EmptyString) { +using LlvmLibcStrDupTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcStrDupTest, EmptyString) { const char *empty = ""; - LIBC_NAMESPACE::libc_errno = 0; char *result = LIBC_NAMESPACE::strdup(empty); ASSERT_ERRNO_SUCCESS(); @@ -23,10 +24,9 @@ TEST(LlvmLibcStrDupTest, EmptyString) { ::free(result); } -TEST(LlvmLibcStrDupTest, AnyString) { +TEST_F(LlvmLibcStrDupTest, AnyString) { const char *abc = "abc"; - LIBC_NAMESPACE::libc_errno = 0; char *result = LIBC_NAMESPACE::strdup(abc); ASSERT_ERRNO_SUCCESS(); @@ -36,8 +36,7 @@ TEST(LlvmLibcStrDupTest, AnyString) { ::free(result); } -TEST(LlvmLibcStrDupTest, NullPtr) { - LIBC_NAMESPACE::libc_errno = 0; +TEST_F(LlvmLibcStrDupTest, NullPtr) { char *result = LIBC_NAMESPACE::strdup(nullptr); ASSERT_ERRNO_SUCCESS(); From 22fd11fe66a0d64f5ef359e21ae67a7d40936eaf Mon Sep 17 00:00:00 2001 From: Abhina Sree Date: Wed, 11 Jun 2025 15:26:49 -0400 Subject: [PATCH 130/851] [SystemZ][z/OS] Refactor AutoConvert.h to remove large MVS guard (#143174) This AutoConvert.h header frequently gets mislabeled as an unused include because it is guarded by MVS internally and every usage is also guarded. 
This refactors the change to remove this guard and instead make these functions a noop on other non-z/OS platforms. --- llvm/include/llvm/Support/AutoConvert.h | 46 +++++++++++++++++++++++-- llvm/lib/Support/AutoConvert.cpp | 21 ----------- llvm/lib/Support/InitLLVM.cpp | 30 ++++++++++------ llvm/lib/Support/MemoryBuffer.cpp | 10 +++--- llvm/lib/Support/raw_ostream.cpp | 19 +++++----- 5 files changed, 78 insertions(+), 48 deletions(-) diff --git a/llvm/include/llvm/Support/AutoConvert.h b/llvm/include/llvm/Support/AutoConvert.h index 352493e9be25f..56ad91425bcc3 100644 --- a/llvm/include/llvm/Support/AutoConvert.h +++ b/llvm/include/llvm/Support/AutoConvert.h @@ -16,6 +16,7 @@ #ifdef __MVS__ #include <_Ccsid.h> +#endif #ifdef __cplusplus #include "llvm/Support/ErrorOr.h" #include @@ -28,9 +29,11 @@ #ifdef __cplusplus extern "C" { #endif /* __cplusplus */ + int enablezOSAutoConversion(int FD); int disablezOSAutoConversion(int FD); int restorezOSStdHandleAutoConversion(int FD); + #ifdef __cplusplus } #endif /* __cplusplus */ @@ -38,6 +41,46 @@ int restorezOSStdHandleAutoConversion(int FD); #ifdef __cplusplus namespace llvm { +inline std::error_code disableAutoConversion(int FD) { +#ifdef __MVS__ + if (::disablezOSAutoConversion(FD) == -1) + return errnoAsErrorCode(); +#endif + return std::error_code(); +} + +inline std::error_code enableAutoConversion(int FD) { +#ifdef __MVS__ + if (::enablezOSAutoConversion(FD) == -1) + return errnoAsErrorCode(); +#endif + return std::error_code(); +} + +inline std::error_code restoreStdHandleAutoConversion(int FD) { +#ifdef __MVS__ + if (::restorezOSStdHandleAutoConversion(FD) == -1) + return errnoAsErrorCode(); +#endif + return std::error_code(); +} + +inline std::error_code setFileTag(int FD, int CCSID, bool Text) { +#ifdef __MVS__ + return setzOSFileTag(FD, CCSID, Text); +#endif + return std::error_code(); +} + +inline ErrorOr needConversion(const char *FileName, const int FD = -1) { +#ifdef __MVS__ + return 
needzOSConversion(FileName, FD); +#endif + return false; +} + +#ifdef __MVS__ + /** \brief Disable the z/OS enhanced ASCII auto-conversion for the file * descriptor. */ @@ -63,9 +106,8 @@ ErrorOr<__ccsid_t> getzOSFileTag(const char *FileName, const int FD = -1); */ ErrorOr needzOSConversion(const char *FileName, const int FD = -1); +#endif /* __MVS__*/ } /* namespace llvm */ #endif /* __cplusplus */ -#endif /* __MVS__ */ - #endif /* LLVM_SUPPORT_AUTOCONVERT_H */ diff --git a/llvm/lib/Support/AutoConvert.cpp b/llvm/lib/Support/AutoConvert.cpp index f7918548df1d0..c69e9a8f97c0e 100644 --- a/llvm/lib/Support/AutoConvert.cpp +++ b/llvm/lib/Support/AutoConvert.cpp @@ -83,27 +83,6 @@ int enablezOSAutoConversion(int FD) { return fcntl(FD, F_CONTROL_CVT, &Query); } -std::error_code llvm::disablezOSAutoConversion(int FD) { - if (::disablezOSAutoConversion(FD) == -1) - return errnoAsErrorCode(); - - return std::error_code(); -} - -std::error_code llvm::enablezOSAutoConversion(int FD) { - if (::enablezOSAutoConversion(FD) == -1) - return errnoAsErrorCode(); - - return std::error_code(); -} - -std::error_code llvm::restorezOSStdHandleAutoConversion(int FD) { - if (::restorezOSStdHandleAutoConversion(FD) == -1) - return errnoAsErrorCode(); - - return std::error_code(); -} - std::error_code llvm::setzOSFileTag(int FD, int CCSID, bool Text) { assert((!Text || (CCSID != FT_UNTAGGED && CCSID != FT_BINARY)) && "FT_UNTAGGED and FT_BINARY are not allowed for text files"); diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp index 50f7a43cc34a7..b8fbfd21c4f28 100644 --- a/llvm/lib/Support/InitLLVM.cpp +++ b/llvm/lib/Support/InitLLVM.cpp @@ -18,18 +18,28 @@ #include "llvm/Support/Windows/WindowsSupport.h" #endif -#ifdef __MVS__ +#if defined(HAVE_UNISTD_H) #include +#else +#ifndef STDIN_FILENO +#define STDIN_FILENO 0 +#endif +#ifndef STDOUT_FILENO +#define STDOUT_FILENO 1 +#endif +#ifndef STDERR_FILENO +#define STDERR_FILENO 2 +#endif +#endif void 
CleanupStdHandles(void *Cookie) { llvm::raw_ostream *Outs = &llvm::outs(), *Errs = &llvm::errs(); Outs->flush(); Errs->flush(); - llvm::restorezOSStdHandleAutoConversion(STDIN_FILENO); - llvm::restorezOSStdHandleAutoConversion(STDOUT_FILENO); - llvm::restorezOSStdHandleAutoConversion(STDERR_FILENO); + llvm::restoreStdHandleAutoConversion(STDIN_FILENO); + llvm::restoreStdHandleAutoConversion(STDOUT_FILENO); + llvm::restoreStdHandleAutoConversion(STDERR_FILENO); } -#endif using namespace llvm; using namespace llvm::sys; @@ -41,10 +51,10 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, assert(!Initialized && "InitLLVM was already initialized!"); Initialized = true; #endif -#ifdef __MVS__ + // Bring stdin/stdout/stderr into a known state. sys::AddSignalHandler(CleanupStdHandles, nullptr); -#endif + if (InstallPipeSignalExitHandler) // The pipe signal handler must be installed before any other handlers are // registered. This is because the Unix \ref RegisterHandlers function does @@ -68,8 +78,8 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, // If turning on conversion for stderr fails then the error message // may be garbled. There is no solution to this problem. 
- ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDERR_FILENO))); - ExitOnErr(errorCodeToError(llvm::enablezOSAutoConversion(STDOUT_FILENO))); + ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDERR_FILENO))); + ExitOnErr(errorCodeToError(llvm::enableAutoConversion(STDOUT_FILENO))); #endif #ifdef _WIN32 @@ -97,8 +107,6 @@ InitLLVM::InitLLVM(int &Argc, const char **&Argv, } InitLLVM::~InitLLVM() { -#ifdef __MVS__ CleanupStdHandles(nullptr); -#endif llvm_shutdown(); } diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp index e2044bcc4e4f0..601f11f6d23c8 100644 --- a/llvm/lib/Support/MemoryBuffer.cpp +++ b/llvm/lib/Support/MemoryBuffer.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/SmallString.h" #include "llvm/Config/config.h" #include "llvm/Support/Alignment.h" +#include "llvm/Support/AutoConvert.h" #include "llvm/Support/Errc.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" @@ -34,9 +35,6 @@ #include #endif -#ifdef __MVS__ -#include "llvm/Support/AutoConvert.h" -#endif using namespace llvm; //===----------------------------------------------------------------------===// @@ -508,15 +506,15 @@ getOpenFileImpl(sys::fs::file_t FD, const Twine &Filename, uint64_t FileSize, } #ifdef __MVS__ - ErrorOr NeedConversion = needzOSConversion(Filename.str().c_str(), FD); - if (std::error_code EC = NeedConversion.getError()) + ErrorOr NeedsConversion = needConversion(Filename.str().c_str(), FD); + if (std::error_code EC = NeedsConversion.getError()) return EC; // File size may increase due to EBCDIC -> UTF-8 conversion, therefore we // cannot trust the file size and we create the memory buffer by copying // off the stream. // Note: This only works with the assumption of reading a full file (i.e, // Offset == 0 and MapSize == FileSize). Reading a file slice does not work. 
- if (Offset == 0 && MapSize == FileSize && *NeedConversion) + if (*NeedsConversion && Offset == 0 && MapSize == FileSize) return getMemoryBufferForStream(FD, Filename); #endif diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp index 16631a63d1921..07b99896543bd 100644 --- a/llvm/lib/Support/raw_ostream.cpp +++ b/llvm/lib/Support/raw_ostream.cpp @@ -894,21 +894,24 @@ void raw_fd_ostream::anchor() {} raw_fd_ostream &llvm::outs() { // Set buffer settings to model stdout behavior. std::error_code EC; -#ifdef __MVS__ - EC = enablezOSAutoConversion(STDOUT_FILENO); - assert(!EC); -#endif + + // On z/OS we need to enable auto conversion + static std::error_code EC1 = enableAutoConversion(STDOUT_FILENO); + assert(!EC1); + (void)EC1; + static raw_fd_ostream S("-", EC, sys::fs::OF_None); assert(!EC); return S; } raw_fd_ostream &llvm::errs() { - // Set standard error to be unbuffered. -#ifdef __MVS__ - std::error_code EC = enablezOSAutoConversion(STDERR_FILENO); + // On z/OS we need to enable auto conversion + static std::error_code EC = enableAutoConversion(STDERR_FILENO); assert(!EC); -#endif + (void)EC; + + // Set standard error to be unbuffered. static raw_fd_ostream S(STDERR_FILENO, false, true); return S; } From 34a1b8ce2518d7868c080519a05892cd3b197192 Mon Sep 17 00:00:00 2001 From: Razvan Lupusoru Date: Wed, 11 Jun 2025 12:37:08 -0700 Subject: [PATCH 131/851] [acc] acc.loop verifier now requires parallelism determination flag (#143720) The OpenACC specification for `acc loop` describes that a loop's parallelism determination mode is either auto, independent, or seq. The rules are as follows. - As per OpenACC 3.3 standard section 2.9.6 independent clause: A loop construct with no auto or seq clause is treated as if it has the independent clause when it is an orphaned loop construct or its parent compute construct is a parallel construct. 
- As per OpenACC 3.3 standard section 2.9.7 auto clause: When the parent compute construct is a kernels construct, a loop construct with no independent or seq clause is treated as if it has the auto clause. - Additionally, loops marked with gang, worker, or vector are not guaranteed to be parallel. Specifically noted in 2.9.7 auto clause: If not, or if it is unable to make a determination, it must treat the auto clause as if it is a seq clause, and it must ignore any gang, worker, or vector clauses on the loop construct. The verifier for `acc.loop` was updated to enforce this marking because the context in which a loop appears is not trivially determined once IR transformations begin. For example, orphaned loops are implicitly `independent`, but after inlining into an `acc.kernels` region they would be implicitly considered `auto`. Thus now the verifier requires that a frontend specifically generates acc dialect with this marking since it knows the context. --- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 35 +++++++++-- mlir/test/Dialect/OpenACC/canonicalize.mlir | 4 +- mlir/test/Dialect/OpenACC/invalid.mlir | 28 ++++----- mlir/test/Dialect/OpenACC/legalize-data.mlir | 16 ++--- mlir/test/Dialect/OpenACC/ops.mlir | 66 ++++++++++---------- 5 files changed, 86 insertions(+), 63 deletions(-) diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 21e6b9d85f1a1..0dfead98b7e73 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -2461,10 +2461,34 @@ LogicalResult acc::LoopOp::verify() { if (hasDuplicateDeviceTypes(getAuto_(), deviceTypes) || hasDuplicateDeviceTypes(getIndependent(), deviceTypes) || hasDuplicateDeviceTypes(getSeq(), deviceTypes)) { - return emitError() << "only one of \"" << acc::LoopOp::getAutoAttrStrName() - << "\", " << getIndependentAttrName() << ", " - << getSeqAttrName() - << " can be present at the same time"; + return emitError() << "only one of auto, 
independent, seq can be present " + "at the same time"; + } + + // Check that at least one of auto, independent, or seq is present + // for the device-independent default clauses. + auto hasDeviceNone = [](mlir::acc::DeviceTypeAttr attr) -> bool { + return attr.getValue() == mlir::acc::DeviceType::None; + }; + bool hasDefaultSeq = + getSeqAttr() + ? llvm::any_of(getSeqAttr().getAsRange(), + hasDeviceNone) + : false; + bool hasDefaultIndependent = + getIndependentAttr() + ? llvm::any_of( + getIndependentAttr().getAsRange(), + hasDeviceNone) + : false; + bool hasDefaultAuto = + getAuto_Attr() + ? llvm::any_of(getAuto_Attr().getAsRange(), + hasDeviceNone) + : false; + if (!hasDefaultSeq && !hasDefaultIndependent && !hasDefaultAuto) { + return emitError() + << "at least one of auto, independent, seq must be present"; } // Gang, worker and vector are incompatible with seq. @@ -2482,8 +2506,7 @@ LogicalResult acc::LoopOp::verify() { deviceTypeAttr.getValue()) || getGangValue(mlir::acc::GangArgType::Static, deviceTypeAttr.getValue())) - return emitError() - << "gang, worker or vector cannot appear with the seq attr"; + return emitError() << "gang, worker or vector cannot appear with seq"; } } diff --git a/mlir/test/Dialect/OpenACC/canonicalize.mlir b/mlir/test/Dialect/OpenACC/canonicalize.mlir index e43a27f6b9e89..fdc8e6b5cae6e 100644 --- a/mlir/test/Dialect/OpenACC/canonicalize.mlir +++ b/mlir/test/Dialect/OpenACC/canonicalize.mlir @@ -116,10 +116,10 @@ func.func @testhostdataop(%a: memref, %ifCond: i1) -> () { acc.host_data dataOperands(%0 : memref) if(%false) { acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.yield - } attributes { inclusiveUpperbound = array } + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.yield - } attributes { inclusiveUpperbound = array } + } attributes {inclusiveUpperbound = array, independent = 
[#acc.device_type]} acc.terminator } return diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index aadf189273212..8f6e961a06163 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -2,7 +2,7 @@ %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -12,7 +12,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -22,7 +22,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -32,7 +32,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -42,7 +42,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 
= arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -52,7 +52,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield @@ -62,7 +62,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} +// expected-error@+1 {{gang, worker or vector cannot appear with seq}} acc.loop { "test.openacc_dummy_op"() : () -> () acc.yield @@ -72,7 +72,7 @@ acc.loop { // expected-error@+1 {{expected non-empty body.}} acc.loop { -} +} attributes {independent = [#acc.device_type]} // ----- @@ -99,7 +99,7 @@ acc.loop { %1 = arith.constant 1 : i32 %2 = arith.constant 10 : i32 -// expected-error@+1 {{only one of "auto", "independent", "seq" can be present at the same time}} +// expected-error@+1 {{only one of auto, independent, seq can be present at the same time}} acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.yield } attributes {auto_ = [#acc.device_type], seq = [#acc.device_type], inclusiveUpperbound = array} @@ -168,7 +168,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32){ // expected-error@+1 {{'acc.init' op cannot be nested in a compute operation}} acc.init acc.yield -} attributes {inclusiveUpperbound = array} +} attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} 
// ----- @@ -186,7 +186,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { // expected-error@+1 {{'acc.shutdown' op cannot be nested in a compute operation}} acc.shutdown acc.yield -} attributes {inclusiveUpperbound = array} +} attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // ----- @@ -198,7 +198,7 @@ acc.loop control(%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.shutdown }) : () -> () acc.yield -} attributes {inclusiveUpperbound = array} +} attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // ----- @@ -797,7 +797,7 @@ func.func @acc_loop_container() { scf.yield } acc.yield - } attributes { collapse = [2], collapseDeviceType = [#acc.device_type] } + } attributes { collapse = [2], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} return } @@ -816,6 +816,6 @@ func.func @acc_loop_container() { scf.yield } acc.yield - } attributes { collapse = [3], collapseDeviceType = [#acc.device_type] } + } attributes { collapse = [3], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} return } diff --git a/mlir/test/Dialect/OpenACC/legalize-data.mlir b/mlir/test/Dialect/OpenACC/legalize-data.mlir index 28ef6761a6ef4..40604dcc736de 100644 --- a/mlir/test/Dialect/OpenACC/legalize-data.mlir +++ b/mlir/test/Dialect/OpenACC/legalize-data.mlir @@ -96,7 +96,7 @@ func.func @test(%a: memref<10xf32>) { acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { %ci = memref.load %a[%i] : memref<10xf32> acc.yield - } + } attributes {independent = [#acc.device_type]} acc.yield } return @@ -109,7 +109,7 @@ func.func @test(%a: memref<10xf32>) { // CHECK: acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index) { // DEVICE: %{{.*}} = memref.load %[[CREATE:.*]][%[[I]]] : memref<10xf32> // CHECK: acc.yield -// CHECK: } +// CHECK: } attributes {independent = [#acc.device_type]} // CHECK: 
acc.yield // CHECK: } @@ -134,7 +134,7 @@ func.func @test(%a: memref<10xf32>) { acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { %ci = memref.load %a[%i] : memref<10xf32> acc.yield - } + } attributes {independent = [#acc.device_type]} acc.yield } return @@ -147,7 +147,7 @@ func.func @test(%a: memref<10xf32>) { // CHECK: acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index) { // DEVICE: %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32> // CHECK: acc.yield -// CHECK: } +// CHECK: } attributes {independent = [#acc.device_type]} // CHECK: acc.yield // CHECK: } @@ -172,7 +172,7 @@ func.func @test(%a: memref<10xf32>) { acc.loop private(@privatization_memref_10_f32 -> %p1 : memref<10xf32>) control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { %ci = memref.load %a[%i] : memref<10xf32> acc.yield - } + } attributes {independent = [#acc.device_type]} acc.yield } return @@ -185,7 +185,7 @@ func.func @test(%a: memref<10xf32>) { // CHECK: acc.loop private(@privatization_memref_10_f32 -> %[[PRIVATE]] : memref<10xf32>) control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index) { // DEVICE: %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32> // CHECK: acc.yield -// CHECK: } +// CHECK: } attributes {independent = [#acc.device_type]} // CHECK: acc.yield // CHECK: } @@ -210,7 +210,7 @@ func.func @test(%a: memref<10xf32>) { acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { %ci = memref.load %a[%i] : memref<10xf32> acc.yield - } + } attributes {seq = [#acc.device_type]} acc.yield } return @@ -223,7 +223,7 @@ func.func @test(%a: memref<10xf32>) { // CHECK: acc.loop control(%[[I:.*]] : index) = (%{{.*}} : index) to (%{{.*}} : index) step (%{{.*}} : index) { // DEVICE: %{{.*}} = memref.load %[[PRIVATE:.*]][%[[I]]] : memref<10xf32> // CHECK: acc.yield -// CHECK: } +// CHECK: } attributes {seq = 
[#acc.device_type]} // CHECK: acc.yield // CHECK: } diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 550f295f074a2..97278f869534b 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -19,7 +19,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x %co = arith.addf %cij, %p : f32 memref.store %co, %C[%arg3, %arg4] : memref<10x10xf32> acc.yield - } attributes { collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} + } attributes { collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array, independent = [#acc.device_type]} acc.yield } @@ -40,7 +40,7 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 // CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} +// CHECK-NEXT: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array, independent = [#acc.device_type]} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: return %{{.*}} : memref<10x10xf32> @@ -129,7 +129,7 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x %tmp = arith.addf %axy, %bxy : f32 memref.store %tmp, %c[%y] : memref<10xf32> acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop control(%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { // for i = 0 to 10 step 1 @@ -139,9 +139,9 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x %z = arith.addf %ci, %dx : f32 memref.store %z, %d[%x] : memref<10xf32> acc.yield - } attributes {inclusiveUpperbound = array, seq = 
[#acc.device_type]} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type], seq = [#acc.device_type]} acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.yield } acc.terminator @@ -166,16 +166,16 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 // CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {inclusiveUpperbound = array} +// CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // CHECK-NEXT: acc.loop control(%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { // CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 // CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} +// CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type], seq = [#acc.device_type]} // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {inclusiveUpperbound = array} +// CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.terminator @@ -196,72 +196,72 @@ func.func @testloopop(%a : memref<10xf32>) -> () { acc.loop gang vector worker control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({num=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { 
"test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({static=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop worker(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop worker(%i32Value: i32) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop worker(%idxValue: index) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop vector(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop vector(%i32Value: i32) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop vector(%idxValue: index) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> 
() acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({num=%i64Value: i64}) worker vector control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({num=%i64Value: i64, static=%i64Value: i64}) worker(%i64Value: i64) vector(%i64Value: i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({num=%i32Value: i32, static=%idxValue: index}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop tile({%i64Value : i64, %i64Value : i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop tile({%i32Value : i32, %i32Value : i32}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.loop gang({static=%i64Value: i64, num=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} 
acc.loop gang({dim=%i64Value : i64, static=%i64Value: i64}) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} %b = acc.cache varPtr(%a : memref<10xf32>) varType(tensor<10xf32>) -> memref<10xf32> acc.loop cache(%b : memref<10xf32>) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} return } @@ -271,7 +271,7 @@ func.func @testloopop(%a : memref<10xf32>) -> () { // CHECK: acc.loop // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: attributes {inclusiveUpperbound = array} +// CHECK-NEXT: attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} // CHECK: acc.loop gang({num=[[I64VALUE]] : i64}) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield @@ -343,7 +343,7 @@ func.func @acc_loop_multiple_block() { cf.br ^bb1(%22 : index) ^bb3: acc.yield - } attributes {inclusiveUpperbound = array} + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.yield } return @@ -1477,7 +1477,7 @@ func.func @acc_reduc_test(%a : i64) -> () { acc.parallel reduction(@reduction_add_i64 -> %a : i64) { acc.loop reduction(@reduction_add_i64 -> %a : i64) control(%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { acc.yield - } attributes { inclusiveUpperbound = array } + } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} acc.yield } return @@ -1869,21 +1869,21 @@ func.func @acc_combined() { acc.parallel combined(loop) { acc.loop combined(parallel) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { acc.yield - } + } attributes 
{independent = [#acc.device_type]} acc.terminator } acc.kernels combined(loop) { acc.loop combined(kernels) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { acc.yield - } + } attributes {auto_ = [#acc.device_type]} acc.terminator } acc.serial combined(loop) { acc.loop combined(serial) control(%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { acc.yield - } + } attributes {seq = [#acc.device_type]} acc.terminator } @@ -1949,7 +1949,7 @@ func.func @acc_loop_container() { scf.yield } acc.yield - } + } attributes {independent = [#acc.device_type]} return } @@ -1971,7 +1971,7 @@ func.func @acc_loop_container() { scf.yield } acc.yield - } attributes { collapse = [2], collapseDeviceType = [#acc.device_type] } + } attributes { collapse = [2], collapseDeviceType = [#acc.device_type], independent = [#acc.device_type]} return } From 02161c635fd70e0214bd8b8320a80992c50ec325 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Wed, 11 Jun 2025 12:44:51 -0700 Subject: [PATCH 132/851] [NVPTX] Misc table-gen cleanup (NFC) (#142877) --- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 196 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 2504 ++++------------- .../Target/NVPTX/NVPTXReplaceImageHandles.cpp | 840 +++--- 3 files changed, 1065 insertions(+), 2475 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index b646d39194c7e..9ca4e8d20650a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -160,7 +160,6 @@ def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">; def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">; def True : Predicate<"true">; -def False : Predicate<"false">; class hasPTX: Predicate<"Subtarget->getPTXVersion() >= " # version>; class hasSM: Predicate<"Subtarget->getSmVersion() >= " # version>; @@ -257,6 +256,11 @@ def BF16X2RT : RegTyInfo; // "prmt.b32${mode}">; // ---> 
"prmt.b32${mode} \t$d, $a, $b, $c;" // +// * BasicFlagsNVPTXInst<(outs Int64Regs:$state), +// (ins ADDR:$addr), +// "mbarrier.arrive.b64">; +// ---> "mbarrier.arrive.b64 \t$state, [$addr];" +// class BasicFlagsNVPTXInst pattern = []> : NVPTXInst< @@ -274,7 +278,11 @@ class BasicFlagsNVPTXInst(!getdagarg(ins_dag, i)), "ADDR"), + "[$" # !getdagname(ins_dag, i) # "]", + "$" # !getdagname(ins_dag, i) + ) + ), ", "))), ";"), pattern>; @@ -956,31 +964,17 @@ def mul_wide_signed : SDNode<"NVPTXISD::MUL_WIDE_SIGNED", SDTMulWide>; def mul_wide_unsigned : SDNode<"NVPTXISD::MUL_WIDE_UNSIGNED", SDTMulWide>; // Matchers for signed, unsigned mul.wide ISD nodes. -def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), - (MULWIDES32 $a, $b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), - (MULWIDES32Imm $a, imm:$b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), - (MULWIDEU32 $a, $b)>, - Requires<[doMulWide]>; -def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)), - (MULWIDEU32Imm $a, imm:$b)>, - Requires<[doMulWide]>; +let Predicates = [doMulWide] in { + def : Pat<(i32 (mul_wide_signed i16:$a, i16:$b)), (MULWIDES32 $a, $b)>; + def : Pat<(i32 (mul_wide_signed i16:$a, imm:$b)), (MULWIDES32Imm $a, imm:$b)>; + def : Pat<(i32 (mul_wide_unsigned i16:$a, i16:$b)), (MULWIDEU32 $a, $b)>; + def : Pat<(i32 (mul_wide_unsigned i16:$a, imm:$b)), (MULWIDEU32Imm $a, imm:$b)>; -def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), - (MULWIDES64 $a, $b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)), - (MULWIDES64Imm $a, imm:$b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), - (MULWIDEU64 $a, $b)>, - Requires<[doMulWide]>; -def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), - (MULWIDEU64Imm $a, imm:$b)>, - Requires<[doMulWide]>; + def : Pat<(i64 (mul_wide_signed i32:$a, i32:$b)), (MULWIDES64 $a, $b)>; + def : Pat<(i64 (mul_wide_signed i32:$a, imm:$b)), (MULWIDES64Imm $a, imm:$b)>; + 
def : Pat<(i64 (mul_wide_unsigned i32:$a, i32:$b)), (MULWIDEU64 $a, $b)>; + def : Pat<(i64 (mul_wide_unsigned i32:$a, imm:$b)), (MULWIDEU64Imm $a, imm:$b)>; +} // Predicates used for converting some patterns to mul.wide. def SInt32Const : PatLeaf<(imm), [{ @@ -1106,18 +1100,12 @@ defm MAD32 : MAD<"mad.lo.s32", i32, Int32Regs, i32imm>; defm MAD64 : MAD<"mad.lo.s64", i64, Int64Regs, i64imm>; } -def INEG16 : - BasicNVPTXInst<(outs Int16Regs:$dst), (ins Int16Regs:$src), - "neg.s16", - [(set i16:$dst, (ineg i16:$src))]>; -def INEG32 : - BasicNVPTXInst<(outs Int32Regs:$dst), (ins Int32Regs:$src), - "neg.s32", - [(set i32:$dst, (ineg i32:$src))]>; -def INEG64 : - BasicNVPTXInst<(outs Int64Regs:$dst), (ins Int64Regs:$src), - "neg.s64", - [(set i64:$dst, (ineg i64:$src))]>; +foreach t = [I16RT, I32RT, I64RT] in { + def NEG_S # t.Size : + BasicNVPTXInst<(outs t.RC:$dst), (ins t.RC:$src), + "neg.s" # t.Size, + [(set t.Ty:$dst, (ineg t.Ty:$src))]>; +} //----------------------------------- // Floating Point Arithmetic @@ -1538,7 +1526,7 @@ def bfi : SDNode<"NVPTXISD::BFI", SDTBFI>; def SDTPRMT : SDTypeProfile<1, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, - SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>,]>; + SDTCisVT<2, i32>, SDTCisVT<3, i32>, SDTCisVT<4, i32>]>; def prmt : SDNode<"NVPTXISD::PRMT", SDTPRMT>; multiclass BFE { @@ -1961,7 +1949,7 @@ multiclass FSET_FORMAT { // f16 -> pred def : Pat<(i1 (OpNode f16:$a, f16:$b)), (SETP_f16rr $a, $b, ModeFTZ)>, - Requires<[useFP16Math,doF32FTZ]>; + Requires<[useFP16Math, doF32FTZ]>; def : Pat<(i1 (OpNode f16:$a, f16:$b)), (SETP_f16rr $a, $b, Mode)>, Requires<[useFP16Math]>; @@ -1969,7 +1957,7 @@ multiclass FSET_FORMAT { // bf16 -> pred def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), (SETP_bf16rr $a, $b, ModeFTZ)>, - Requires<[hasBF16Math,doF32FTZ]>; + Requires<[hasBF16Math, doF32FTZ]>; def : Pat<(i1 (OpNode bf16:$a, bf16:$b)), (SETP_bf16rr $a, $b, Mode)>, Requires<[hasBF16Math]>; @@ -2497,24 +2485,20 @@ def : Pat<(f16 (uint_to_fp 
i32:$a)), (CVT_f16_u32 $a, CvtRN)>; def : Pat<(f16 (uint_to_fp i64:$a)), (CVT_f16_u64 $a, CvtRN)>; // sint -> bf16 -def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; +let Predicates = [hasPTX<78>, hasSM<90>] in { + def : Pat<(bf16 (sint_to_fp i1:$a)), (CVT_bf16_s32 (SELP_b32ii 1, 0, $a), CvtRN)>; + def : Pat<(bf16 (sint_to_fp i16:$a)), (CVT_bf16_s16 $a, CvtRN)>; + def : Pat<(bf16 (sint_to_fp i32:$a)), (CVT_bf16_s32 $a, CvtRN)>; + def : Pat<(bf16 (sint_to_fp i64:$a)), (CVT_bf16_s64 $a, CvtRN)>; +} // uint -> bf16 -def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; -def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>, - Requires<[hasPTX<78>, hasSM<90>]>; +let Predicates = [hasPTX<78>, hasSM<90>] in { + def : Pat<(bf16 (uint_to_fp i1:$a)), (CVT_bf16_u32 (SELP_b32ii 1, 0, $a), CvtRN)>; + def : Pat<(bf16 (uint_to_fp i16:$a)), (CVT_bf16_u16 $a, CvtRN)>; + def : Pat<(bf16 (uint_to_fp i32:$a)), (CVT_bf16_u32 $a, CvtRN)>; + def : Pat<(bf16 (uint_to_fp i64:$a)), (CVT_bf16_u64 $a, CvtRN)>; +} // sint -> f32 def : Pat<(f32 (sint_to_fp i1:$a)), (CVT_f32_s32 (SELP_b32ii -1, 0, $a), CvtRN)>; @@ -2565,27 +2549,25 @@ def : Pat<(i16 (fp_to_uint bf16:$a)), (CVT_u16_bf16 $a, CvtRZI)>; def : Pat<(i32 (fp_to_uint bf16:$a)), (CVT_u32_bf16 $a, CvtRZI)>; def : Pat<(i64 (fp_to_uint bf16:$a)), (CVT_u64_bf16 $a, CvtRZI)>; // f32 -> sint 
-def : Pat<(i1 (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>; -def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; +let Predicates = [doF32FTZ] in { + def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI_FTZ)>; + def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>; + def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>; +} +def : Pat<(i1 (fp_to_sint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>; def : Pat<(i16 (fp_to_sint f32:$a)), (CVT_s16_f32 $a, CvtRZI)>; -def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i32 (fp_to_sint f32:$a)), (CVT_s32_f32 $a, CvtRZI)>; -def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i64 (fp_to_sint f32:$a)), (CVT_s64_f32 $a, CvtRZI)>; // f32 -> uint +let Predicates = [doF32FTZ] in { + def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>; + def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>; + def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>; +} def : Pat<(i1 (fp_to_uint f32:$a)), (SETP_b32ri $a, 0, CmpEQ)>; -def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i16 (fp_to_uint f32:$a)), (CVT_u16_f32 $a, CvtRZI)>; -def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i32 (fp_to_uint f32:$a)), (CVT_u32_f32 $a, CvtRZI)>; -def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI_FTZ)>, - Requires<[doF32FTZ]>; def : Pat<(i64 (fp_to_uint f32:$a)), (CVT_u64_f32 $a, CvtRZI)>; // f64 -> sint @@ -2707,28 +2689,24 @@ let hasSideEffects = false in { // PTX 7.1 lets you avoid a temp register and just use _ as a "sink" for the // unused high/low part. 
- def I32toI16H_Sink : NVPTXInst<(outs Int16Regs:$high), - (ins Int32Regs:$s), - "mov.b32 \t{{_, $high}}, $s;", - []>, Requires<[hasPTX<71>]>; - def I32toI16L_Sink : NVPTXInst<(outs Int16Regs:$low), - (ins Int32Regs:$s), - "mov.b32 \t{{$low, _}}, $s;", - []>, Requires<[hasPTX<71>]>; - def I64toI32H_Sink : NVPTXInst<(outs Int32Regs:$high), - (ins Int64Regs:$s), - "mov.b64 \t{{_, $high}}, $s;", - []>, Requires<[hasPTX<71>]>; - def I64toI32L_Sink : NVPTXInst<(outs Int32Regs:$low), - (ins Int64Regs:$s), - "mov.b64 \t{{$low, _}}, $s;", - []>, Requires<[hasPTX<71>]>; + let Predicates = [hasPTX<71>] in { + def I32toI16H_Sink : NVPTXInst<(outs Int16Regs:$high), (ins Int32Regs:$s), + "mov.b32 \t{{_, $high}}, $s;", []>; + def I32toI16L_Sink : NVPTXInst<(outs Int16Regs:$low), (ins Int32Regs:$s), + "mov.b32 \t{{$low, _}}, $s;", []>; + def I64toI32H_Sink : NVPTXInst<(outs Int32Regs:$high), (ins Int64Regs:$s), + "mov.b64 \t{{_, $high}}, $s;", []>; + def I64toI32L_Sink : NVPTXInst<(outs Int32Regs:$low), (ins Int64Regs:$s), + "mov.b64 \t{{$low, _}}, $s;", []>; + } } -def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>; -def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>, Requires<[hasPTX<71>]>; -def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>; -def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>, Requires<[hasPTX<71>]>; +let Predicates = [hasPTX<71>] in { + def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>; + def : Pat<(i16 (trunc (sra i32:$s, (i32 16)))), (I32toI16H_Sink i32:$s)>; + def : Pat<(i32 (trunc (srl i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>; + def : Pat<(i32 (trunc (sra i64:$s, (i32 32)))), (I64toI32H_Sink i64:$s)>; +} // Fall back to the old way if we don't have PTX 7.1. 
def : Pat<(i16 (trunc (srl i32:$s, (i32 16)))), (I32toI16H $s)>; @@ -3061,29 +3039,19 @@ def stacksave : SDNode<"NVPTXISD::STACKSAVE", SDTIntLeaf, [SDNPHasChain, SDNPSideEffect]>; -def STACKRESTORE_32 : - BasicNVPTXInst<(outs), (ins Int32Regs:$ptr), - "stackrestore.u32", - [(stackrestore i32:$ptr)]>, - Requires<[hasPTX<73>, hasSM<52>]>; - -def STACKSAVE_32 : - BasicNVPTXInst<(outs Int32Regs:$dst), (ins), - "stacksave.u32", - [(set i32:$dst, (i32 stacksave))]>, - Requires<[hasPTX<73>, hasSM<52>]>; - -def STACKRESTORE_64 : - BasicNVPTXInst<(outs), (ins Int64Regs:$ptr), - "stackrestore.u64", - [(stackrestore i64:$ptr)]>, - Requires<[hasPTX<73>, hasSM<52>]>; - -def STACKSAVE_64 : - BasicNVPTXInst<(outs Int64Regs:$dst), (ins), - "stacksave.u64", - [(set i64:$dst, (i64 stacksave))]>, - Requires<[hasPTX<73>, hasSM<52>]>; +let Predicates = [hasPTX<73>, hasSM<52>] in { + foreach t = [I32RT, I64RT] in { + def STACKRESTORE_ # t.Size : + BasicNVPTXInst<(outs), (ins t.RC:$ptr), + "stackrestore.u" # t.Size, + [(stackrestore t.Ty:$ptr)]>; + + def STACKSAVE_ # t.Size : + BasicNVPTXInst<(outs t.RC:$dst), (ins), + "stacksave.u" # t.Size, + [(set t.Ty:$dst, (t.Ty stacksave))]>; + } +} include "NVPTXIntrinsics.td" @@ -3124,7 +3092,7 @@ def : Pat < //////////////////////////////////////////////////////////////////////////////// class NVPTXFenceInst: - NVPTXInst<(outs), (ins), "fence."#sem#"."#scope#";", []>, + BasicNVPTXInst<(outs), (ins), "fence."#sem#"."#scope>, Requires<[ptx, hasSM<70>]>; foreach scope = ["sys", "gpu", "cluster", "cta"] in { diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index f918160001ba5..83d7defe6d9a9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -52,7 +52,7 @@ class PTX { def ptx : PTX; // Generates list of n sequential register names. -// E.g. RegNames<3,"r">.ret -> ["r0", "r1", "r2" ] +// E.g. 
RegNames<3, "r">.ret -> ["r0", "r1", "r2" ] class RegSeq { list ret = !if(n, !listconcat(RegSeq.ret, [prefix # !sub(n, 1)]), @@ -137,7 +137,7 @@ defm BARRIER_CTA_ARRIVE : BARRIER2<"barrier.arrive", int_nvvm_barrier_cta_arrive class INT_BARRIER_CLUSTER Preds = [hasPTX<78>, hasSM<90>]>: - NVPTXInst<(outs), (ins), "barrier.cluster."# variant #";", [(Intr)]>, + BasicNVPTXInst<(outs), (ins), "barrier.cluster."# variant, [(Intr)]>, Requires; def barrier_cluster_arrive: @@ -400,13 +400,9 @@ def INT_FENCE_PROXY_TENSORMAP_GENERIC_ACQUIRE_SYS : //----------------------------------- multiclass CP_ASYNC_MBARRIER_ARRIVE { - def _32 : NVPTXInst<(outs), (ins Int32Regs:$addr), - !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"), - [(Intrin i32:$addr)]>, - Requires<[hasPTX<70>, hasSM<80>]>; - def _64 : NVPTXInst<(outs), (ins Int64Regs:$addr), - !strconcat("cp.async.mbarrier.arrive", NoInc, AddrSpace, ".b64 [$addr];"), - [(Intrin i64:$addr)]>, + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "cp.async.mbarrier.arrive" # NoInc # AddrSpace # ".b64", + [(Intrin addr:$addr)]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -420,30 +416,19 @@ defm CP_ASYNC_MBARRIER_ARRIVE_NOINC_SHARED : CP_ASYNC_MBARRIER_ARRIVE<".noinc", ".shared", int_nvvm_cp_async_mbarrier_arrive_noinc_shared>; multiclass CP_ASYNC_SHARED_GLOBAL_I { - def _32 : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin i32:$dst, i32:$src)]>, - Requires<[hasPTX<70>, hasSM<80>]>; - def _64 : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ";"), - [(Intrin i64:$dst, i64:$src)]>, + def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src), + "cp.async." 
# cc # ".shared.global" # " [$dst], [$src], " # cpsize # ";", + [(Intrin addr:$dst, addr:$src)]>, Requires<[hasPTX<70>, hasSM<80>]>; + // Variant with src_size parameter - def _32s : NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, Int32Regs:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(IntrinS i32:$dst, i32:$src, i32:$src_size)]>, - Requires<[hasPTX<70>, hasSM<80>]>; - def _32si: NVPTXInst<(outs), (ins Int32Regs:$dst, Int32Regs:$src, i32imm:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(IntrinS i32:$dst, i32:$src, imm:$src_size)]>, - Requires<[hasPTX<70>, hasSM<80>]>; - def _64s : NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, Int32Regs:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(IntrinS i64:$dst, i64:$src, i32:$src_size)]>, + def _s : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$src_size), + "cp.async." # cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;", + [(IntrinS addr:$dst, addr:$src, i32:$src_size)]>, Requires<[hasPTX<70>, hasSM<80>]>; - def _64si: NVPTXInst<(outs), (ins Int64Regs:$dst, Int64Regs:$src, i32imm:$src_size), - !strconcat("cp.async.", cc, ".shared.global [$dst], [$src], ", cpsize, ", $src_size;"), - [(IntrinS i64:$dst, i64:$src, imm:$src_size)]>, + def _si: NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, i32imm:$src_size), + "cp.async." 
# cc # ".shared.global" # " [$dst], [$src], " # cpsize # ", $src_size;", + [(IntrinS addr:$dst, addr:$src, imm:$src_size)]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -513,14 +498,14 @@ class CpAsyncBulkStr { } multiclass CP_ASYNC_BULK_S2G_INTR { - def NAME : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch), + def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch), !if(has_ch, CpAsyncBulkStr<0, 1>.S2G # " [$dst], [$src], $size, $ch;", CpAsyncBulkStr<0, 0>.S2G # " [$dst], [$src], $size;"), [(int_nvvm_cp_async_bulk_shared_cta_to_global addr:$dst, addr:$src, i32:$size, i64:$ch, !if(has_ch, -1, 0))]>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask), + def _BM : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$src, Int32Regs:$size, Int64Regs:$ch, Int16Regs:$mask), !if(has_ch, CpAsyncBulkStr<0, 1, 1>.S2G # " [$dst], [$src], $size, $ch, $mask;", CpAsyncBulkStr<0, 0, 1>.S2G # " [$dst], [$src], $size, $mask;"), @@ -533,7 +518,7 @@ defm CP_ASYNC_BULK_S2G_CH : CP_ASYNC_BULK_S2G_INTR; multiclass CP_ASYNC_BULK_G2S_INTR { defvar Intr = int_nvvm_cp_async_bulk_global_to_shared_cluster; - def NAME : NVPTXInst<(outs), + def "" : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch), !if(has_ch, @@ -542,7 +527,7 @@ multiclass CP_ASYNC_BULK_G2S_INTR { [(Intr addr:$dst, addr:$mbar, addr:$src, i32:$size, i16:$mask, i64:$ch, 0, !if(has_ch, -1, 0))]>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _MC : NVPTXInst<(outs), + def _MC : NVPTXInst<(outs), (ins ADDR:$dst, ADDR:$mbar, ADDR:$src, Int32Regs:$size, Int16Regs:$mask, Int64Regs:$ch), !if(has_ch, @@ -561,7 +546,7 @@ def CP_ASYNC_BULK_CTA_TO_CLUSTER : NVPTXInst<(outs), Requires<[hasPTX<80>, hasSM<90>]>; multiclass CP_ASYNC_BULK_PREFETCH_INTR { - def NAME : NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch), + def "" : 
NVPTXInst<(outs), (ins ADDR:$src, Int32Regs:$size, Int64Regs:$ch), !if(has_ch, "cp.async.bulk.prefetch.L2.global.L2::cache_hint" # " [$src], $size, $ch;", "cp.async.bulk.prefetch.L2.global" # " [$src], $size;"), @@ -609,19 +594,19 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR defvar asm_str = !if(!eq(mode, "im2col"), !strconcat(asm_str_default, im2col_asm_str), asm_str_default); - def NAME: NVPTXInst<(outs), + def "" : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag), !strconcat(G2S_STRINGS.inst_name, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _MC: NVPTXInst<(outs), + def _MC : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), + def _CH : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _MC_CH: NVPTXInst<(outs), + def _MC_CH : NVPTXInst<(outs), !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc, $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -661,11 +646,11 @@ multiclass CP_ASYNC_BULK_TENSOR_S2G_INTR { defvar asm_str = " [$tmap, {{" # dims_str # "}}], [$src]"; defvar rc = !if(shared32, Int32Regs, Int64Regs); - def NAME: NVPTXInst<(outs), + def "" : NVPTXInst<(outs), !con((ins rc:$src, Int64Regs:$tmap), dims_dag), !strconcat(S2G_STRINGS.inst_name, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), + def _CH : NVPTXInst<(outs), !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch)), !strconcat(S2G_STRINGS.inst_name, asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -685,11 +670,11 @@ 
multiclass CP_ASYNC_BULK_TENSOR_REDUCE_INTR defvar prefix = "cp.reduce.async.bulk.tensor" # "." # dim # "d" # ".global.shared::cta"; defvar suffix = "." # mode # ".bulk_group"; - def NAME: NVPTXInst<(outs), + def "" : NVPTXInst<(outs), !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins TMAReductionFlags:$red_op)), !strconcat(prefix, "${red_op}", suffix, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), + def _CH : NVPTXInst<(outs), !con((ins rc:$src, Int64Regs:$tmap), dims_dag, (ins Int64Regs:$ch, TMAReductionFlags:$red_op)), !strconcat(prefix, "${red_op}", suffix, ".L2::cache_hint", asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -735,11 +720,11 @@ multiclass CP_ASYNC_BULK_TENSOR_PREFETCH_INTR { defvar asm_str = !if(!eq(mode, "im2col"), !strconcat(asm_str_default, im2col_asm_str), asm_str_default); - def NAME: NVPTXInst<(outs), + def "" : NVPTXInst<(outs), !con((ins Int64Regs:$tmap), dims_dag, im2col_dag), !strconcat(PREFETCH_STRINGS.inst_name, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; - def NAME # _CH: NVPTXInst<(outs), + def _CH : NVPTXInst<(outs), !con((ins Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)), !strconcat(PREFETCH_STRINGS.inst_name, asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -755,10 +740,10 @@ foreach dim = [1, 2, 3, 4, 5] in { //Prefetch and Prefetchu class PREFETCH_INTRS : - NVPTXInst<(outs), (ins Int64Regs:$addr), - InstName # " [$addr];", + BasicNVPTXInst<(outs), (ins ADDR:$addr), + InstName, [(!cast(!strconcat("int_nvvm_", - !subst(".", "_", InstName))) i64:$addr)]>, + !subst(".", "_", InstName))) addr:$addr)]>, Requires<[hasPTX<80>, hasSM<90>]>; @@ -769,36 +754,39 @@ def PREFETCH_LOCAL_L1 : PREFETCH_INTRS<"prefetch.local.L1">; def PREFETCH_GLOBAL_L2 : PREFETCH_INTRS<"prefetch.global.L2">; def PREFETCH_LOCAL_L2 : PREFETCH_INTRS<"prefetch.local.L2">; -def PREFETCH_GLOBAL_L2_EVICT_NORMAL : NVPTXInst<(outs), (ins Int64Regs:$addr), - 
"prefetch.global.L2::evict_normal" # " [$addr];", - [(!cast("int_nvvm_prefetch_global_L2_evict_normal") i64:$addr)]>, +def PREFETCH_GLOBAL_L2_EVICT_NORMAL : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch.global.L2::evict_normal", + [(int_nvvm_prefetch_global_L2_evict_normal addr:$addr)]>, Requires<[hasPTX<80>, hasSM<90>]>; -def PREFETCH_GLOBAL_L2_EVICT_LAST : NVPTXInst<(outs), (ins Int64Regs:$addr), - "prefetch.global.L2::evict_last" # " [$addr];", - [(!cast("int_nvvm_prefetch_global_L2_evict_last") i64:$addr)]>, +def PREFETCH_GLOBAL_L2_EVICT_LAST : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "prefetch.global.L2::evict_last", + [(int_nvvm_prefetch_global_L2_evict_last addr:$addr)]>, Requires<[hasPTX<80>, hasSM<90>]>; def PREFETCHU_L1 : PREFETCH_INTRS<"prefetchu.L1">; //Applypriority intrinsics -class APPLYPRIORITY_L2_INTRS : - NVPTXInst<(outs), (ins Int64Regs:$addr, Int64Regs:$size), - StrJoin<".", ["applypriority", addr , "L2::evict_normal"]>.ret # " [$addr], $size;", - [(!cast(StrJoin<"_", ["int_nvvm_applypriority", addr , "L2_evict_normal"]>.ret) - i64:$addr, i64:$size)]>, +class APPLYPRIORITY_L2_INTRS : + BasicNVPTXInst<(outs), (ins ADDR:$addr, Int64Regs:$size), + StrJoin<".", ["applypriority", addrspace , "L2::evict_normal"]>.ret, + [(!cast(StrJoin<"_", ["int_nvvm_applypriority", addrspace , "L2_evict_normal"]>.ret) + addr:$addr, i64:$size)]>, Requires<[hasPTX<74>, hasSM<80>]>; def APPLYPRIORITY_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"">; def APPLYPRIORITY_GLOBAL_L2_EVICT_NORMAL : APPLYPRIORITY_L2_INTRS<"global">; //Discard Intrinsics -class DISCARD_L2_INTRS : - NVPTXInst<(outs), (ins Int64Regs:$addr), - StrJoin<".", ["discard", Addr , "L2"]>.ret # " [$addr], 128;", - [(!cast(StrJoin<"_", ["int_nvvm_discard", Addr , "L2"]>.ret) - i64:$addr, (i64 128))]>, + +def discard_size_imm : TImmLeaf; + +class DISCARD_L2_INTRS : + BasicNVPTXInst<(outs), (ins ADDR:$addr, i64imm:$size), + StrJoin<".", ["discard", addrspace , "L2"]>.ret, + [(!cast(StrJoin<"_", 
["int_nvvm_discard", addrspace , "L2"]>.ret) + addr:$addr, discard_size_imm:$size)]>, Requires<[hasPTX<74>, hasSM<80>]>; def DISCARD_L2 : DISCARD_L2_INTRS<"">; @@ -809,8 +797,8 @@ def DISCARD_GLOBAL_L2 : DISCARD_L2_INTRS<"global">; //----------------------------------- multiclass MBARRIER_INIT { - def "" : NVPTXInst<(outs), (ins ADDR:$addr, Int32Regs:$count), - "mbarrier.init" # AddrSpace # ".b64 [$addr], $count;", + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr, Int32Regs:$count), + "mbarrier.init" # AddrSpace # ".b64", [(Intrin addr:$addr, i32:$count)]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -820,8 +808,8 @@ defm MBARRIER_INIT_SHARED : MBARRIER_INIT<".shared", int_nvvm_mbarrier_init_shared>; multiclass MBARRIER_INVAL { - def "" : NVPTXInst<(outs), (ins ADDR:$addr), - "mbarrier.inval" # AddrSpace # ".b64 [$addr];", + def "" : BasicNVPTXInst<(outs), (ins ADDR:$addr), + "mbarrier.inval" # AddrSpace # ".b64", [(Intrin addr:$addr)]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -831,8 +819,8 @@ defm MBARRIER_INVAL_SHARED : MBARRIER_INVAL<".shared", int_nvvm_mbarrier_inval_shared>; multiclass MBARRIER_ARRIVE { - def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr), - "mbarrier.arrive" # AddrSpace # ".b64 $state, [$addr];", + def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr), + "mbarrier.arrive" # AddrSpace # ".b64", [(set i64:$state, (Intrin addr:$addr))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -842,9 +830,9 @@ defm MBARRIER_ARRIVE_SHARED : MBARRIER_ARRIVE<".shared", int_nvvm_mbarrier_arrive_shared>; multiclass MBARRIER_ARRIVE_NOCOMPLETE { - def "" : NVPTXInst<(outs Int64Regs:$state), + def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr, Int32Regs:$count), - "mbarrier.arrive.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;", + "mbarrier.arrive.noComplete" # AddrSpace # ".b64", [(set i64:$state, (Intrin addr:$addr, i32:$count))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -855,8 +843,8 @@ defm 
MBARRIER_ARRIVE_NOCOMPLETE_SHARED : MBARRIER_ARRIVE_NOCOMPLETE<".shared", int_nvvm_mbarrier_arrive_noComplete_shared>; multiclass MBARRIER_ARRIVE_DROP { - def "" : NVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr), - "mbarrier.arrive_drop" # AddrSpace # ".b64 $state, [$addr];", + def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr), + "mbarrier.arrive_drop" # AddrSpace # ".b64", [(set i64:$state, (Intrin addr:$addr))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -867,9 +855,9 @@ defm MBARRIER_ARRIVE_DROP_SHARED : MBARRIER_ARRIVE_DROP<".shared", int_nvvm_mbarrier_arrive_drop_shared>; multiclass MBARRIER_ARRIVE_DROP_NOCOMPLETE { - def "" : NVPTXInst<(outs Int64Regs:$state), + def "" : BasicNVPTXInst<(outs Int64Regs:$state), (ins ADDR:$addr, Int32Regs:$count), - "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64 $state, [$addr], $count;", + "mbarrier.arrive_drop.noComplete" # AddrSpace # ".b64", [(set i64:$state, (Intrin addr:$addr, i32:$count))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -881,8 +869,8 @@ defm MBARRIER_ARRIVE_DROP_NOCOMPLETE_SHARED : int_nvvm_mbarrier_arrive_drop_noComplete_shared>; multiclass MBARRIER_TEST_WAIT { - def "" : NVPTXInst<(outs Int1Regs:$res), (ins ADDR:$addr, Int64Regs:$state), - "mbarrier.test_wait" # AddrSpace # ".b64 $res, [$addr], $state;", + def "" : BasicNVPTXInst<(outs Int1Regs:$res), (ins ADDR:$addr, Int64Regs:$state), + "mbarrier.test_wait" # AddrSpace # ".b64", [(set i1:$res, (Intrin addr:$addr, i64:$state))]>, Requires<[hasPTX<70>, hasSM<80>]>; } @@ -1790,93 +1778,74 @@ def : Pat<(int_nvvm_ff_to_e5m2x2_rn f32:$a, f32:$b), def : Pat<(int_nvvm_ff_to_e5m2x2_rn_relu f32:$a, f32:$b), (CVT_e5m2x2_f32 $a, $b, CvtRN_RELU)>; -def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn Int32Regs:$a), +def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn v2f16:$a), (CVT_e4m3x2_f16x2 $a, CvtRN)>; -def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu Int32Regs:$a), +def : Pat<(int_nvvm_f16x2_to_e4m3x2_rn_relu v2f16:$a), (CVT_e4m3x2_f16x2 $a, CvtRN_RELU)>; -def : 
Pat<(int_nvvm_f16x2_to_e5m2x2_rn Int32Regs:$a), +def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn v2f16:$a), (CVT_e5m2x2_f16x2 $a, CvtRN)>; -def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu Int32Regs:$a), +def : Pat<(int_nvvm_f16x2_to_e5m2x2_rn_relu v2f16:$a), (CVT_e5m2x2_f16x2 $a, CvtRN_RELU)>; -def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn Int16Regs:$a), +def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn i16:$a), (CVT_f16x2_e4m3x2 $a, CvtRN)>; -def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu Int16Regs:$a), +def : Pat<(int_nvvm_e4m3x2_to_f16x2_rn_relu i16:$a), (CVT_f16x2_e4m3x2 $a, CvtRN_RELU)>; -def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn Int16Regs:$a), +def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn i16:$a), (CVT_f16x2_e5m2x2 $a, CvtRN)>; -def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu Int16Regs:$a), +def : Pat<(int_nvvm_e5m2x2_to_f16x2_rn_relu i16:$a), (CVT_f16x2_e5m2x2 $a, CvtRN_RELU)>; -def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b), - (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b), - (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b), - (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b), - (CVT_e3m2x2_f32_sf $a, $b, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a), - (CVT_f16x2_e2m3x2 $a, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a), - (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a), - (CVT_f16x2_e3m2x2 $a, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : 
Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a), - (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b), - (CVT_e2m1x2_f32_sf $a, $b, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b), - (CVT_e2m1x2_f32_sf $a, $b, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn Int16Regs:$a), - (CVT_f16x2_e2m1x2 $a, CvtRN)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu Int16Regs:$a), - (CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b), - (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b), - (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b), - (CVT_ue8m0x2_f32 $a, $b, CvtRP)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b), - (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz Int32Regs:$a), - (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite Int32Regs:$a), - (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp Int32Regs:$a), - (CVT_ue8m0x2_bf16x2 $a, CvtRP)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; -def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite Int32Regs:$a), - (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>, - 
Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; - -def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a), - (CVT_bf16x2_ue8m0x2 $a)>, - Requires<[hasPTX<86>, hasSM<100>, hasArchAccelFeatures]>; +let Predicates = [hasPTX<86>, hasSM<100>, hasArchAccelFeatures] in { + def : Pat<(int_nvvm_ff_to_e2m3x2_rn_satfinite f32:$a, f32:$b), + (CVT_e2m3x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff_to_e2m3x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e2m3x2_f32_sf $a, $b, CvtRN_RELU)>; + def : Pat<(int_nvvm_ff_to_e3m2x2_rn_satfinite f32:$a, f32:$b), + (CVT_e3m2x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff_to_e3m2x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e3m2x2_f32_sf $a, $b, CvtRN_RELU)>; + + def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e2m3x2 $a, CvtRN)>; + def : Pat<(int_nvvm_e2m3x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e2m3x2 $a, CvtRN_RELU)>; + def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e3m2x2 $a, CvtRN)>; + def : Pat<(int_nvvm_e3m2x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e3m2x2 $a, CvtRN_RELU)>; + + def : Pat<(int_nvvm_ff_to_e2m1x2_rn_satfinite f32:$a, f32:$b), + (CVT_e2m1x2_f32_sf $a, $b, CvtRN)>; + def : Pat<(int_nvvm_ff_to_e2m1x2_rn_relu_satfinite f32:$a, f32:$b), + (CVT_e2m1x2_f32_sf $a, $b, CvtRN_RELU)>; + + def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn i16:$a), + (CVT_f16x2_e2m1x2 $a, CvtRN)>; + def : Pat<(int_nvvm_e2m1x2_to_f16x2_rn_relu i16:$a), + (CVT_f16x2_e2m1x2 $a, CvtRN_RELU)>; + + def : Pat<(int_nvvm_ff_to_ue8m0x2_rz f32:$a, f32:$b), + (CVT_ue8m0x2_f32 $a, $b, CvtRZ)>; + def : Pat<(int_nvvm_ff_to_ue8m0x2_rz_satfinite f32:$a, f32:$b), + (CVT_ue8m0x2_f32_sf $a, $b, CvtRZ)>; + def : Pat<(int_nvvm_ff_to_ue8m0x2_rp f32:$a, f32:$b), + (CVT_ue8m0x2_f32 $a, $b, CvtRP)>; + def : Pat<(int_nvvm_ff_to_ue8m0x2_rp_satfinite f32:$a, f32:$b), + (CVT_ue8m0x2_f32_sf $a, $b, CvtRP)>; + + def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz v2bf16:$a), + (CVT_ue8m0x2_bf16x2 $a, CvtRZ)>; + def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rz_satfinite 
v2bf16:$a), + (CVT_ue8m0x2_bf16x2_sf $a, CvtRZ)>; + def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp v2bf16:$a), + (CVT_ue8m0x2_bf16x2 $a, CvtRP)>; + def : Pat<(int_nvvm_bf16x2_to_ue8m0x2_rp_satfinite v2bf16:$a), + (CVT_ue8m0x2_bf16x2_sf $a, CvtRP)>; + + def : Pat<(int_nvvm_ue8m0x2_to_bf16x2 i16:$a), + (CVT_bf16x2_ue8m0x2 $a)>; +} // // FNS @@ -1920,14 +1889,14 @@ class ATOMIC_GENERIC_CHK multiclass F_ATOMIC_2 preds> { - defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b;"; + defvar asm_str = "atom" # sem_str # as_str # "." # op_str; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def r : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b), + def r : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b), asm_str, [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b))]>, Requires; if t.SupportsImm then - def i : NVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b), + def i : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b), asm_str, [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b)))]>, Requires; @@ -1937,27 +1906,27 @@ multiclass F_ATOMIC_2 preds> { - defvar asm_str = "atom" # sem_str # as_str # "." # op_str # " \t$dst, [$addr], $b, $c;"; + defvar asm_str = "atom" # sem_str # as_str # "." 
# op_str; let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in { - def rr : NVPTXInst<(outs t.RC:$dst), + def rr : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.RC:$c), asm_str, [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, t.Ty:$c))]>, Requires; - def ir : NVPTXInst<(outs t.RC:$dst), + def ir : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b, t.RC:$c), asm_str, [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), t.Ty:$c))]>, Requires; - def ri : NVPTXInst<(outs t.RC:$dst), + def ri : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.RC:$b, t.Imm:$c), asm_str, [(set t.Ty:$dst, (op addr:$addr, t.Ty:$b, (t.Ty t.ImmNode:$c)))]>, Requires; - def ii : NVPTXInst<(outs t.RC:$dst), + def ii : BasicNVPTXInst<(outs t.RC:$dst), (ins ADDR:$addr, t.Imm:$b, t.Imm:$c), asm_str, [(set t.Ty:$dst, (op addr:$addr, (t.Ty t.ImmNode:$b), (t.Ty t.ImmNode:$c)))]>, @@ -2100,7 +2069,7 @@ multiclass ATOM3S_impl; + t, !listconcat(Preds, [hasAtomScope])>; } } } @@ -4454,1956 +4423,616 @@ defm SULD_3D_V4I32_ZERO : SULD_3D_V4<"suld.b.3d.v4.b32.zero", Int32Regs>; //----------------------------------- let IsSurfTexQuery = true in { -def TXQ_CHANNEL_ORDER_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.channel_order.b32 \t$d, [$a];", - []>; -def TXQ_CHANNEL_ORDER_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.channel_order.b32 \t$d, [$a];", - []>; -def TXQ_CHANNEL_DATA_TYPE_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.channel_data_type.b32 \t$d, [$a];", - []>; -def TXQ_CHANNEL_DATA_TYPE_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.channel_data_type.b32 \t$d, [$a];", - []>; -def TXQ_WIDTH_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.width.b32 \t$d, [$a];", - []>; -def TXQ_WIDTH_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.width.b32 \t$d, [$a];", - []>; -def TXQ_HEIGHT_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.height.b32 \t$d, [$a];", - []>; -def 
TXQ_HEIGHT_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.height.b32 \t$d, [$a];", - []>; -def TXQ_DEPTH_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.depth.b32 \t$d, [$a];", - []>; -def TXQ_DEPTH_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.depth.b32 \t$d, [$a];", - []>; -def TXQ_ARRAY_SIZE_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.array_size.b32 \t$d, [$a];", - []>; -def TXQ_ARRAY_SIZE_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.array_size.b32 \t$d, [$a];", - []>; -def TXQ_NUM_SAMPLES_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.num_samples.b32 \t$d, [$a];", - []>; -def TXQ_NUM_SAMPLES_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.num_samples.b32 \t$d, [$a];", - []>; -def TXQ_NUM_MIPMAP_LEVELS_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "txq.num_mipmap_levels.b32 \t$d, [$a];", - []>; -def TXQ_NUM_MIPMAP_LEVELS_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "txq.num_mipmap_levels.b32 \t$d, [$a];", - []>; -} - -def : Pat<(int_nvvm_txq_channel_order i64:$a), - (TXQ_CHANNEL_ORDER_R $a)>; -def : Pat<(int_nvvm_txq_channel_data_type i64:$a), - (TXQ_CHANNEL_DATA_TYPE_R $a)>; -def : Pat<(int_nvvm_txq_width i64:$a), - (TXQ_WIDTH_R $a)>; -def : Pat<(int_nvvm_txq_height i64:$a), - (TXQ_HEIGHT_R $a)>; -def : Pat<(int_nvvm_txq_depth i64:$a), - (TXQ_DEPTH_R $a)>; -def : Pat<(int_nvvm_txq_array_size i64:$a), - (TXQ_ARRAY_SIZE_R $a)>; -def : Pat<(int_nvvm_txq_num_samples i64:$a), - (TXQ_NUM_SAMPLES_R $a)>; -def : Pat<(int_nvvm_txq_num_mipmap_levels i64:$a), - (TXQ_NUM_MIPMAP_LEVELS_R $a)>; - + foreach query = ["channel_order", "channel_data_type", "width", "height", + "depth", "array_size", "num_samples", "num_mipmap_levels"] in { + def TXQ_ # !toupper(query) # _R + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "txq." 
# query # ".b32 \t$d, [$a];", + [(set i32:$d, (!cast("int_nvvm_txq_" # query) i64:$a))]>; + def TXQ_ # !toupper(query) # _I + : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), + "txq." # query # ".b32 \t$d, [$a];", + []>; + } +} //----------------------------------- // Surface Query Intrinsics //----------------------------------- let IsSurfTexQuery = true in { -def SUQ_CHANNEL_ORDER_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.channel_order.b32 \t$d, [$a];", - []>; -def SUQ_CHANNEL_ORDER_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.channel_order.b32 \t$d, [$a];", - []>; -def SUQ_CHANNEL_DATA_TYPE_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.channel_data_type.b32 \t$d, [$a];", - []>; -def SUQ_CHANNEL_DATA_TYPE_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.channel_data_type.b32 \t$d, [$a];", - []>; -def SUQ_WIDTH_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.width.b32 \t$d, [$a];", - []>; -def SUQ_WIDTH_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.width.b32 \t$d, [$a];", - []>; -def SUQ_HEIGHT_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.height.b32 \t$d, [$a];", - []>; -def SUQ_HEIGHT_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.height.b32 \t$d, [$a];", - []>; -def SUQ_DEPTH_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.depth.b32 \t$d, [$a];", - []>; -def SUQ_DEPTH_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.depth.b32 \t$d, [$a];", - []>; -def SUQ_ARRAY_SIZE_R - : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), - "suq.array_size.b32 \t$d, [$a];", - []>; -def SUQ_ARRAY_SIZE_I - : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), - "suq.array_size.b32 \t$d, [$a];", - []>; -} - -def : Pat<(int_nvvm_suq_channel_order i64:$a), - (SUQ_CHANNEL_ORDER_R $a)>; -def : Pat<(int_nvvm_suq_channel_data_type i64:$a), - (SUQ_CHANNEL_DATA_TYPE_R $a)>; -def : Pat<(int_nvvm_suq_width i64:$a), - (SUQ_WIDTH_R $a)>; 
-def : Pat<(int_nvvm_suq_height i64:$a), - (SUQ_HEIGHT_R $a)>; -def : Pat<(int_nvvm_suq_depth i64:$a), - (SUQ_DEPTH_R $a)>; -def : Pat<(int_nvvm_suq_array_size i64:$a), - (SUQ_ARRAY_SIZE_R $a)>; - + foreach query = ["channel_order", "channel_data_type", "width", "height", "depth", "array_size"] in { + def SUQ_ # !toupper(query) # _R + : NVPTXInst<(outs Int32Regs:$d), (ins Int64Regs:$a), + "suq." # query # ".b32 \t$d, [$a];", + [(set i32:$d, (!cast("int_nvvm_suq_" # query) i64:$a))]>; + def SUQ_ # !toupper(query) # _I + : NVPTXInst<(outs Int32Regs:$d), (ins i64imm:$a), + "suq." # query # ".b32 \t$d, [$a];", + []>; + } +} //===- Handle Query -------------------------------------------------------===// // TODO: These intrinsics are not yet finalized, pending PTX ISA design work def ISTYPEP_SAMPLER - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "istypep.samplerref \t$d, $a;", + : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.samplerref", [(set i1:$d, (int_nvvm_istypep_sampler i64:$a))]>; def ISTYPEP_SURFACE - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "istypep.surfref \t$d, $a;", + : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.surfref", [(set i1:$d, (int_nvvm_istypep_surface i64:$a))]>; def ISTYPEP_TEXTURE - : NVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), - "istypep.texref \t$d, $a;", + : BasicNVPTXInst<(outs Int1Regs:$d), (ins Int64Regs:$a), + "istypep.texref", [(set i1:$d, (int_nvvm_istypep_texture i64:$a))]>; //===- Surface Stores -----------------------------------------------------===// let IsSust = true in { -class SUST_1D_base +class SUST_1D_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, intype:$r)), - inst # " \t[$s, \\{$x\\}], \\{$r\\};", - []>; + inst # " \t[$s, \\{$x\\}], \\{$r\\};", pat>; multiclass SUST_1D { - def _R : SUST_1D_base; - def _I : SUST_1D_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + + def _R : SUST_1D_base; + def _I : SUST_1D_base; } -defm 
SUST_B_1D_B8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>; -defm SUST_B_1D_B16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>; -defm SUST_B_1D_B32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>; -defm SUST_B_1D_B64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>; +defm SUST_B_1D_I8_CLAMP : SUST_1D<"sust.b.1d.b8.clamp", Int16Regs>; +defm SUST_B_1D_I16_CLAMP : SUST_1D<"sust.b.1d.b16.clamp", Int16Regs>; +defm SUST_B_1D_I32_CLAMP : SUST_1D<"sust.b.1d.b32.clamp", Int32Regs>; +defm SUST_B_1D_I64_CLAMP : SUST_1D<"sust.b.1d.b64.clamp", Int64Regs>; -defm SUST_B_1D_B8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>; -defm SUST_B_1D_B16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>; -defm SUST_B_1D_B32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>; -defm SUST_B_1D_B64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>; +defm SUST_B_1D_I8_TRAP : SUST_1D<"sust.b.1d.b8.trap", Int16Regs>; +defm SUST_B_1D_I16_TRAP : SUST_1D<"sust.b.1d.b16.trap", Int16Regs>; +defm SUST_B_1D_I32_TRAP : SUST_1D<"sust.b.1d.b32.trap", Int32Regs>; +defm SUST_B_1D_I64_TRAP : SUST_1D<"sust.b.1d.b64.trap", Int64Regs>; -defm SUST_B_1D_B8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>; -defm SUST_B_1D_B16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>; -defm SUST_B_1D_B32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>; -defm SUST_B_1D_B64_ZERO : SUST_1D<"sust.b.1d.b64.zero", Int64Regs>; +defm SUST_B_1D_I8_ZERO : SUST_1D<"sust.b.1d.b8.zero", Int16Regs>; +defm SUST_B_1D_I16_ZERO : SUST_1D<"sust.b.1d.b16.zero", Int16Regs>; +defm SUST_B_1D_I32_ZERO : SUST_1D<"sust.b.1d.b32.zero", Int32Regs>; +defm SUST_B_1D_I64_ZERO : SUST_1D<"sust.b.1d.b64.zero", Int64Regs>; -defm SUST_P_1D_B8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>; -defm SUST_P_1D_B16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>; -defm SUST_P_1D_B32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>; +defm SUST_P_1D_I8_TRAP : SUST_1D<"sust.p.1d.b8.trap", Int16Regs>; +defm SUST_P_1D_I16_TRAP : SUST_1D<"sust.p.1d.b16.trap", Int16Regs>; 
+defm SUST_P_1D_I32_TRAP : SUST_1D<"sust.p.1d.b32.trap", Int32Regs>; -class SUST_1D_V2_base +class SUST_1D_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g)), inst # " \t[$s, \\{$x\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_1D_V2 { - def _R : SUST_1D_V2_base; - def _I : SUST_1D_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_V2_base; + def _I : SUST_1D_V2_base; } -defm SUST_B_1D_V2B8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>; -defm SUST_B_1D_V2B16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>; -defm SUST_B_1D_V2B32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>; -defm SUST_B_1D_V2B64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>; +// int_nvvm_sust_b_1d_v2i8_clamp -defm SUST_B_1D_V2B8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>; -defm SUST_B_1D_V2B16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>; -defm SUST_B_1D_V2B32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>; -defm SUST_B_1D_V2B64_TRAP : SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>; +defm SUST_B_1D_V2I8_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b8.clamp", Int16Regs>; +defm SUST_B_1D_V2I16_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b16.clamp", Int16Regs>; +defm SUST_B_1D_V2I32_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b32.clamp", Int32Regs>; +defm SUST_B_1D_V2I64_CLAMP : SUST_1D_V2<"sust.b.1d.v2.b64.clamp", Int64Regs>; -defm SUST_B_1D_V2B8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>; -defm SUST_B_1D_V2B16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>; -defm SUST_B_1D_V2B32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>; -defm SUST_B_1D_V2B64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>; +defm SUST_B_1D_V2I8_TRAP : SUST_1D_V2<"sust.b.1d.v2.b8.trap", Int16Regs>; +defm SUST_B_1D_V2I16_TRAP : SUST_1D_V2<"sust.b.1d.v2.b16.trap", Int16Regs>; +defm SUST_B_1D_V2I32_TRAP : SUST_1D_V2<"sust.b.1d.v2.b32.trap", Int32Regs>; +defm SUST_B_1D_V2I64_TRAP : 
SUST_1D_V2<"sust.b.1d.v2.b64.trap", Int64Regs>; -defm SUST_P_1D_V2B8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>; -defm SUST_P_1D_V2B16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>; -defm SUST_P_1D_V2B32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>; +defm SUST_B_1D_V2I8_ZERO : SUST_1D_V2<"sust.b.1d.v2.b8.zero", Int16Regs>; +defm SUST_B_1D_V2I16_ZERO : SUST_1D_V2<"sust.b.1d.v2.b16.zero", Int16Regs>; +defm SUST_B_1D_V2I32_ZERO : SUST_1D_V2<"sust.b.1d.v2.b32.zero", Int32Regs>; +defm SUST_B_1D_V2I64_ZERO : SUST_1D_V2<"sust.b.1d.v2.b64.zero", Int64Regs>; -class SUST_1D_V4_base +defm SUST_P_1D_V2I8_TRAP : SUST_1D_V2<"sust.p.1d.v2.b8.trap", Int16Regs>; +defm SUST_P_1D_V2I16_TRAP : SUST_1D_V2<"sust.p.1d.v2.b16.trap", Int16Regs>; +defm SUST_P_1D_V2I32_TRAP : SUST_1D_V2<"sust.p.1d.v2.b32.trap", Int32Regs>; + +class SUST_1D_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, intype:$r, intype:$g, intype:$b, intype:$a)), inst # " \t[$s, \\{$x\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_1D_V4 { - def _R : SUST_1D_V4_base; - def _I : SUST_1D_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_V4_base; + def _I : SUST_1D_V4_base; } -defm SUST_B_1D_V4B8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>; -defm SUST_B_1D_V4B16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>; -defm SUST_B_1D_V4B32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>; +defm SUST_B_1D_V4I8_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b8.clamp", Int16Regs>; +defm SUST_B_1D_V4I16_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b16.clamp", Int16Regs>; +defm SUST_B_1D_V4I32_CLAMP : SUST_1D_V4<"sust.b.1d.v4.b32.clamp", Int32Regs>; -defm SUST_B_1D_V4B8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>; -defm SUST_B_1D_V4B16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>; -defm SUST_B_1D_V4B32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>; +defm SUST_B_1D_V4I8_TRAP : SUST_1D_V4<"sust.b.1d.v4.b8.trap", Int16Regs>; +defm 
SUST_B_1D_V4I16_TRAP : SUST_1D_V4<"sust.b.1d.v4.b16.trap", Int16Regs>; +defm SUST_B_1D_V4I32_TRAP : SUST_1D_V4<"sust.b.1d.v4.b32.trap", Int32Regs>; -defm SUST_B_1D_V4B8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>; -defm SUST_B_1D_V4B16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>; -defm SUST_B_1D_V4B32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>; +defm SUST_B_1D_V4I8_ZERO : SUST_1D_V4<"sust.b.1d.v4.b8.zero", Int16Regs>; +defm SUST_B_1D_V4I16_ZERO : SUST_1D_V4<"sust.b.1d.v4.b16.zero", Int16Regs>; +defm SUST_B_1D_V4I32_ZERO : SUST_1D_V4<"sust.b.1d.v4.b32.zero", Int32Regs>; -defm SUST_P_1D_V4B8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>; -defm SUST_P_1D_V4B16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>; -defm SUST_P_1D_V4B32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>; +defm SUST_P_1D_V4I8_TRAP : SUST_1D_V4<"sust.p.1d.v4.b8.trap", Int16Regs>; +defm SUST_P_1D_V4I16_TRAP : SUST_1D_V4<"sust.p.1d.v4.b16.trap", Int16Regs>; +defm SUST_P_1D_V4I32_TRAP : SUST_1D_V4<"sust.p.1d.v4.b32.trap", Int32Regs>; -class SUST_1D_ARRAY_base +class SUST_1D_ARRAY_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r)), inst # " \t[$s, \\{$idx, $x\\}], \\{$r\\};", - []>; + pat>; multiclass SUST_1D_ARRAY { - def _R : SUST_1D_ARRAY_base; - def _I : SUST_1D_ARRAY_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_ARRAY_base; + def _I : SUST_1D_ARRAY_base; } -defm SUST_B_1D_ARRAY_B8_CLAMP +defm SUST_B_1D_ARRAY_I8_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b8.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_B16_CLAMP +defm SUST_B_1D_ARRAY_I16_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b16.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_B32_CLAMP +defm SUST_B_1D_ARRAY_I32_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b32.clamp", Int32Regs>; -defm SUST_B_1D_ARRAY_B64_CLAMP +defm SUST_B_1D_ARRAY_I64_CLAMP : SUST_1D_ARRAY<"sust.b.a1d.b64.clamp", Int64Regs>; -defm SUST_B_1D_ARRAY_B8_TRAP +defm SUST_B_1D_ARRAY_I8_TRAP : 
SUST_1D_ARRAY<"sust.b.a1d.b8.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_B16_TRAP +defm SUST_B_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b16.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_B32_TRAP +defm SUST_B_1D_ARRAY_I32_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b32.trap", Int32Regs>; -defm SUST_B_1D_ARRAY_B64_TRAP +defm SUST_B_1D_ARRAY_I64_TRAP : SUST_1D_ARRAY<"sust.b.a1d.b64.trap", Int64Regs>; -defm SUST_B_1D_ARRAY_B8_ZERO +defm SUST_B_1D_ARRAY_I8_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b8.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_B16_ZERO +defm SUST_B_1D_ARRAY_I16_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b16.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_B32_ZERO +defm SUST_B_1D_ARRAY_I32_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b32.zero", Int32Regs>; -defm SUST_B_1D_ARRAY_B64_ZERO +defm SUST_B_1D_ARRAY_I64_ZERO : SUST_1D_ARRAY<"sust.b.a1d.b64.zero", Int64Regs>; -defm SUST_P_1D_ARRAY_B8_TRAP +defm SUST_P_1D_ARRAY_I8_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b8.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_B16_TRAP +defm SUST_P_1D_ARRAY_I16_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b16.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_B32_TRAP +defm SUST_P_1D_ARRAY_I32_TRAP : SUST_1D_ARRAY<"sust.p.a1d.b32.trap", Int32Regs>; -class SUST_1D_ARRAY_V2_base +class SUST_1D_ARRAY_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r, intype:$g)), inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_1D_ARRAY_V2 { - def _R : SUST_1D_ARRAY_V2_base; - def _I : SUST_1D_ARRAY_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_ARRAY_V2_base; + def _I : SUST_1D_ARRAY_V2_base; } -defm SUST_B_1D_ARRAY_V2B8_CLAMP +defm SUST_B_1D_ARRAY_V2I8_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B16_CLAMP +defm SUST_B_1D_ARRAY_V2I16_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B32_CLAMP +defm SUST_B_1D_ARRAY_V2I32_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.clamp", Int32Regs>; -defm 
SUST_B_1D_ARRAY_V2B64_CLAMP +defm SUST_B_1D_ARRAY_V2I64_CLAMP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.clamp", Int64Regs>; -defm SUST_B_1D_ARRAY_V2B8_TRAP +defm SUST_B_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B16_TRAP +defm SUST_B_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B32_TRAP +defm SUST_B_1D_ARRAY_V2I32_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.trap", Int32Regs>; -defm SUST_B_1D_ARRAY_V2B64_TRAP +defm SUST_B_1D_ARRAY_V2I64_TRAP : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.trap", Int64Regs>; -defm SUST_B_1D_ARRAY_V2B8_ZERO +defm SUST_B_1D_ARRAY_V2I8_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b8.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B16_ZERO +defm SUST_B_1D_ARRAY_V2I16_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b16.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_V2B32_ZERO +defm SUST_B_1D_ARRAY_V2I32_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b32.zero", Int32Regs>; -defm SUST_B_1D_ARRAY_V2B64_ZERO +defm SUST_B_1D_ARRAY_V2I64_ZERO : SUST_1D_ARRAY_V2<"sust.b.a1d.v2.b64.zero", Int64Regs>; -defm SUST_P_1D_ARRAY_V2B8_TRAP +defm SUST_P_1D_ARRAY_V2I8_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b8.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_V2B16_TRAP +defm SUST_P_1D_ARRAY_V2I16_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b16.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_V2B32_TRAP +defm SUST_P_1D_ARRAY_V2I32_TRAP : SUST_1D_ARRAY_V2<"sust.p.a1d.v2.b32.trap", Int32Regs>; -class SUST_1D_ARRAY_V4_base +class SUST_1D_ARRAY_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, intype:$r, intype:$g, intype:$b, intype:$a)), inst # " \t[$s, \\{$idx, $x\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_1D_ARRAY_V4 { - def _R : SUST_1D_ARRAY_V4_base; - def _I : SUST_1D_ARRAY_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_1D_ARRAY_V4_base; + def _I : SUST_1D_ARRAY_V4_base; } -defm SUST_B_1D_ARRAY_V4B8_CLAMP +defm SUST_B_1D_ARRAY_V4I8_CLAMP : 
SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B16_CLAMP +defm SUST_B_1D_ARRAY_V4I16_CLAMP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.clamp", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B32_CLAMP +defm SUST_B_1D_ARRAY_V4I32_CLAMP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.clamp", Int32Regs>; -defm SUST_B_1D_ARRAY_V4B8_TRAP +defm SUST_B_1D_ARRAY_V4I8_TRAP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B16_TRAP +defm SUST_B_1D_ARRAY_V4I16_TRAP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.trap", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B32_TRAP +defm SUST_B_1D_ARRAY_V4I32_TRAP : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.trap", Int32Regs>; -defm SUST_B_1D_ARRAY_V4B8_ZERO +defm SUST_B_1D_ARRAY_V4I8_ZERO : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b8.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B16_ZERO +defm SUST_B_1D_ARRAY_V4I16_ZERO : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b16.zero", Int16Regs>; -defm SUST_B_1D_ARRAY_V4B32_ZERO +defm SUST_B_1D_ARRAY_V4I32_ZERO : SUST_1D_ARRAY_V4<"sust.b.a1d.v4.b32.zero", Int32Regs>; -defm SUST_P_1D_ARRAY_V4B8_TRAP +defm SUST_P_1D_ARRAY_V4I8_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b8.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_V4B16_TRAP +defm SUST_P_1D_ARRAY_V4I16_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b16.trap", Int16Regs>; -defm SUST_P_1D_ARRAY_V4B32_TRAP +defm SUST_P_1D_ARRAY_V4I32_TRAP : SUST_1D_ARRAY_V4<"sust.p.a1d.v4.b32.trap", Int32Regs>; -class SUST_2D_base +class SUST_2D_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r)), inst # " \t[$s, \\{$x, $y\\}], \\{$r\\};", - []>; + pat>; multiclass SUST_2D { - def _R : SUST_2D_base; - def _I : SUST_2D_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_base; + def _I : SUST_2D_base; } -defm SUST_B_2D_B8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>; -defm SUST_B_2D_B16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>; -defm SUST_B_2D_B32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>; -defm SUST_B_2D_B64_CLAMP : 
SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>; +defm SUST_B_2D_I8_CLAMP : SUST_2D<"sust.b.2d.b8.clamp", Int16Regs>; +defm SUST_B_2D_I16_CLAMP : SUST_2D<"sust.b.2d.b16.clamp", Int16Regs>; +defm SUST_B_2D_I32_CLAMP : SUST_2D<"sust.b.2d.b32.clamp", Int32Regs>; +defm SUST_B_2D_I64_CLAMP : SUST_2D<"sust.b.2d.b64.clamp", Int64Regs>; -defm SUST_B_2D_B8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>; -defm SUST_B_2D_B16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>; -defm SUST_B_2D_B32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>; -defm SUST_B_2D_B64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>; +defm SUST_B_2D_I8_TRAP : SUST_2D<"sust.b.2d.b8.trap", Int16Regs>; +defm SUST_B_2D_I16_TRAP : SUST_2D<"sust.b.2d.b16.trap", Int16Regs>; +defm SUST_B_2D_I32_TRAP : SUST_2D<"sust.b.2d.b32.trap", Int32Regs>; +defm SUST_B_2D_I64_TRAP : SUST_2D<"sust.b.2d.b64.trap", Int64Regs>; -defm SUST_B_2D_B8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>; -defm SUST_B_2D_B16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>; -defm SUST_B_2D_B32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>; -defm SUST_B_2D_B64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>; +defm SUST_B_2D_I8_ZERO : SUST_2D<"sust.b.2d.b8.zero", Int16Regs>; +defm SUST_B_2D_I16_ZERO : SUST_2D<"sust.b.2d.b16.zero", Int16Regs>; +defm SUST_B_2D_I32_ZERO : SUST_2D<"sust.b.2d.b32.zero", Int32Regs>; +defm SUST_B_2D_I64_ZERO : SUST_2D<"sust.b.2d.b64.zero", Int64Regs>; -defm SUST_P_2D_B8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>; -defm SUST_P_2D_B16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>; -defm SUST_P_2D_B32_TRAP : SUST_2D<"sust.p.2d.b32.trap", Int32Regs>; +defm SUST_P_2D_I8_TRAP : SUST_2D<"sust.p.2d.b8.trap", Int16Regs>; +defm SUST_P_2D_I16_TRAP : SUST_2D<"sust.p.2d.b16.trap", Int16Regs>; +defm SUST_P_2D_I32_TRAP : SUST_2D<"sust.p.2d.b32.trap", Int32Regs>; -class SUST_2D_V2_base +class SUST_2D_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r, intype:$g)), inst # " \t[$s, \\{$x, 
$y\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_2D_V2 { - def _R : SUST_2D_V2_base; - def _I : SUST_2D_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_V2_base; + def _I : SUST_2D_V2_base; } -defm SUST_B_2D_V2B8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>; -defm SUST_B_2D_V2B16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>; -defm SUST_B_2D_V2B32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>; -defm SUST_B_2D_V2B64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>; +defm SUST_B_2D_V2I8_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b8.clamp", Int16Regs>; +defm SUST_B_2D_V2I16_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b16.clamp", Int16Regs>; +defm SUST_B_2D_V2I32_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b32.clamp", Int32Regs>; +defm SUST_B_2D_V2I64_CLAMP : SUST_2D_V2<"sust.b.2d.v2.b64.clamp", Int64Regs>; -defm SUST_B_2D_V2B8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>; -defm SUST_B_2D_V2B16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>; -defm SUST_B_2D_V2B32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>; -defm SUST_B_2D_V2B64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>; +defm SUST_B_2D_V2I8_TRAP : SUST_2D_V2<"sust.b.2d.v2.b8.trap", Int16Regs>; +defm SUST_B_2D_V2I16_TRAP : SUST_2D_V2<"sust.b.2d.v2.b16.trap", Int16Regs>; +defm SUST_B_2D_V2I32_TRAP : SUST_2D_V2<"sust.b.2d.v2.b32.trap", Int32Regs>; +defm SUST_B_2D_V2I64_TRAP : SUST_2D_V2<"sust.b.2d.v2.b64.trap", Int64Regs>; -defm SUST_B_2D_V2B8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>; -defm SUST_B_2D_V2B16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", Int16Regs>; -defm SUST_B_2D_V2B32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", Int32Regs>; -defm SUST_B_2D_V2B64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>; +defm SUST_B_2D_V2I8_ZERO : SUST_2D_V2<"sust.b.2d.v2.b8.zero", Int16Regs>; +defm SUST_B_2D_V2I16_ZERO : SUST_2D_V2<"sust.b.2d.v2.b16.zero", Int16Regs>; +defm SUST_B_2D_V2I32_ZERO : SUST_2D_V2<"sust.b.2d.v2.b32.zero", 
Int32Regs>; +defm SUST_B_2D_V2I64_ZERO : SUST_2D_V2<"sust.b.2d.v2.b64.zero", Int64Regs>; -defm SUST_P_2D_V2B8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>; -defm SUST_P_2D_V2B16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>; -defm SUST_P_2D_V2B32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>; +defm SUST_P_2D_V2I8_TRAP : SUST_2D_V2<"sust.p.2d.v2.b8.trap", Int16Regs>; +defm SUST_P_2D_V2I16_TRAP : SUST_2D_V2<"sust.p.2d.v2.b16.trap", Int16Regs>; +defm SUST_P_2D_V2I32_TRAP : SUST_2D_V2<"sust.p.2d.v2.b32.trap", Int32Regs>; -class SUST_2D_V4_base +class SUST_2D_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, intype:$r, intype:$g, intype:$b, intype:$a)), inst # " \t[$s, \\{$x, $y\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_2D_V4 { - def _R : SUST_2D_V4_base; - def _I : SUST_2D_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_V4_base; + def _I : SUST_2D_V4_base; } -defm SUST_B_2D_V4B8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>; -defm SUST_B_2D_V4B16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>; -defm SUST_B_2D_V4B32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>; +defm SUST_B_2D_V4I8_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b8.clamp", Int16Regs>; +defm SUST_B_2D_V4I16_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b16.clamp", Int16Regs>; +defm SUST_B_2D_V4I32_CLAMP : SUST_2D_V4<"sust.b.2d.v4.b32.clamp", Int32Regs>; -defm SUST_B_2D_V4B8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>; -defm SUST_B_2D_V4B16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>; -defm SUST_B_2D_V4B32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>; +defm SUST_B_2D_V4I8_TRAP : SUST_2D_V4<"sust.b.2d.v4.b8.trap", Int16Regs>; +defm SUST_B_2D_V4I16_TRAP : SUST_2D_V4<"sust.b.2d.v4.b16.trap", Int16Regs>; +defm SUST_B_2D_V4I32_TRAP : SUST_2D_V4<"sust.b.2d.v4.b32.trap", Int32Regs>; -defm SUST_B_2D_V4B8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>; -defm SUST_B_2D_V4B16_ZERO 
: SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>; -defm SUST_B_2D_V4B32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>; +defm SUST_B_2D_V4I8_ZERO : SUST_2D_V4<"sust.b.2d.v4.b8.zero", Int16Regs>; +defm SUST_B_2D_V4I16_ZERO : SUST_2D_V4<"sust.b.2d.v4.b16.zero", Int16Regs>; +defm SUST_B_2D_V4I32_ZERO : SUST_2D_V4<"sust.b.2d.v4.b32.zero", Int32Regs>; -defm SUST_P_2D_V4B8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>; -defm SUST_P_2D_V4B16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>; -defm SUST_P_2D_V4B32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>; +defm SUST_P_2D_V4I8_TRAP : SUST_2D_V4<"sust.p.2d.v4.b8.trap", Int16Regs>; +defm SUST_P_2D_V4I16_TRAP : SUST_2D_V4<"sust.p.2d.v4.b16.trap", Int16Regs>; +defm SUST_P_2D_V4I32_TRAP : SUST_2D_V4<"sust.p.2d.v4.b32.trap", Int32Regs>; -class SUST_2D_ARRAY_base +class SUST_2D_ARRAY_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, intype:$r)), inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r\\};", - []>; + pat>; multiclass SUST_2D_ARRAY { - def _R : SUST_2D_ARRAY_base; - def _I : SUST_2D_ARRAY_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_ARRAY_base; + def _I : SUST_2D_ARRAY_base; } -defm SUST_B_2D_ARRAY_B8_CLAMP +defm SUST_B_2D_ARRAY_I8_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b8.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_B16_CLAMP +defm SUST_B_2D_ARRAY_I16_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b16.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_B32_CLAMP +defm SUST_B_2D_ARRAY_I32_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b32.clamp", Int32Regs>; -defm SUST_B_2D_ARRAY_B64_CLAMP +defm SUST_B_2D_ARRAY_I64_CLAMP : SUST_2D_ARRAY<"sust.b.a2d.b64.clamp", Int64Regs>; -defm SUST_B_2D_ARRAY_B8_TRAP +defm SUST_B_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b8.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_B16_TRAP +defm SUST_B_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b16.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_B32_TRAP +defm SUST_B_2D_ARRAY_I32_TRAP : 
SUST_2D_ARRAY<"sust.b.a2d.b32.trap", Int32Regs>; -defm SUST_B_2D_ARRAY_B64_TRAP +defm SUST_B_2D_ARRAY_I64_TRAP : SUST_2D_ARRAY<"sust.b.a2d.b64.trap", Int64Regs>; -defm SUST_B_2D_ARRAY_B8_ZERO +defm SUST_B_2D_ARRAY_I8_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b8.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_B16_ZERO +defm SUST_B_2D_ARRAY_I16_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b16.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_B32_ZERO +defm SUST_B_2D_ARRAY_I32_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b32.zero", Int32Regs>; -defm SUST_B_2D_ARRAY_B64_ZERO +defm SUST_B_2D_ARRAY_I64_ZERO : SUST_2D_ARRAY<"sust.b.a2d.b64.zero", Int64Regs>; -defm SUST_P_2D_ARRAY_B8_TRAP +defm SUST_P_2D_ARRAY_I8_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b8.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_B16_TRAP +defm SUST_P_2D_ARRAY_I16_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b16.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_B32_TRAP +defm SUST_P_2D_ARRAY_I32_TRAP : SUST_2D_ARRAY<"sust.p.a2d.b32.trap", Int32Regs>; -class SUST_2D_ARRAY_V2_base +class SUST_2D_ARRAY_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, intype:$r, intype:$g)), inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_2D_ARRAY_V2 { - def _R : SUST_2D_ARRAY_V2_base; - def _I : SUST_2D_ARRAY_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_ARRAY_V2_base; + def _I : SUST_2D_ARRAY_V2_base; } -defm SUST_B_2D_ARRAY_V2B8_CLAMP +defm SUST_B_2D_ARRAY_V2I8_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B16_CLAMP +defm SUST_B_2D_ARRAY_V2I16_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B32_CLAMP +defm SUST_B_2D_ARRAY_V2I32_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.clamp", Int32Regs>; -defm SUST_B_2D_ARRAY_V2B64_CLAMP +defm SUST_B_2D_ARRAY_V2I64_CLAMP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.clamp", Int64Regs>; -defm SUST_B_2D_ARRAY_V2B8_TRAP +defm SUST_B_2D_ARRAY_V2I8_TRAP : 
SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B16_TRAP +defm SUST_B_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B32_TRAP +defm SUST_B_2D_ARRAY_V2I32_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.trap", Int32Regs>; -defm SUST_B_2D_ARRAY_V2B64_TRAP +defm SUST_B_2D_ARRAY_V2I64_TRAP : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.trap", Int64Regs>; -defm SUST_B_2D_ARRAY_V2B8_ZERO +defm SUST_B_2D_ARRAY_V2I8_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b8.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B16_ZERO +defm SUST_B_2D_ARRAY_V2I16_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b16.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_V2B32_ZERO +defm SUST_B_2D_ARRAY_V2I32_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b32.zero", Int32Regs>; -defm SUST_B_2D_ARRAY_V2B64_ZERO +defm SUST_B_2D_ARRAY_V2I64_ZERO : SUST_2D_ARRAY_V2<"sust.b.a2d.v2.b64.zero", Int64Regs>; -defm SUST_P_2D_ARRAY_V2B8_TRAP +defm SUST_P_2D_ARRAY_V2I8_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b8.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_V2B16_TRAP +defm SUST_P_2D_ARRAY_V2I16_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b16.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_V2B32_TRAP +defm SUST_P_2D_ARRAY_V2I32_TRAP : SUST_2D_ARRAY_V2<"sust.p.a2d.v2.b32.trap", Int32Regs>; -class SUST_2D_ARRAY_V4_base +class SUST_2D_ARRAY_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$idx, Int32Regs:$x, Int32Regs:$y, intype:$r, intype:$g, intype:$b, intype:$a)), inst # " \t[$s, \\{$idx, $x, $y, $y\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_2D_ARRAY_V4 { - def _R : SUST_2D_ARRAY_V4_base; - def _I : SUST_2D_ARRAY_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_2D_ARRAY_V4_base; + def _I : SUST_2D_ARRAY_V4_base; } -defm SUST_B_2D_ARRAY_V4B8_CLAMP +defm SUST_B_2D_ARRAY_V4I8_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B16_CLAMP +defm SUST_B_2D_ARRAY_V4I16_CLAMP : 
SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.clamp", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B32_CLAMP +defm SUST_B_2D_ARRAY_V4I32_CLAMP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.clamp", Int32Regs>; -defm SUST_B_2D_ARRAY_V4B8_TRAP +defm SUST_B_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B16_TRAP +defm SUST_B_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.trap", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B32_TRAP +defm SUST_B_2D_ARRAY_V4I32_TRAP : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.trap", Int32Regs>; -defm SUST_B_2D_ARRAY_V4B8_ZERO +defm SUST_B_2D_ARRAY_V4I8_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b8.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B16_ZERO +defm SUST_B_2D_ARRAY_V4I16_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b16.zero", Int16Regs>; -defm SUST_B_2D_ARRAY_V4B32_ZERO +defm SUST_B_2D_ARRAY_V4I32_ZERO : SUST_2D_ARRAY_V4<"sust.b.a2d.v4.b32.zero", Int32Regs>; -defm SUST_P_2D_ARRAY_V4B8_TRAP +defm SUST_P_2D_ARRAY_V4I8_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b8.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_V4B16_TRAP +defm SUST_P_2D_ARRAY_V4I16_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b16.trap", Int16Regs>; -defm SUST_P_2D_ARRAY_V4B32_TRAP +defm SUST_P_2D_ARRAY_V4I32_TRAP : SUST_2D_ARRAY_V4<"sust.p.a2d.v4.b32.trap", Int32Regs>; -class SUST_3D_base +class SUST_3D_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, intype:$r)), inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r\\};", - []>; + pat>; multiclass SUST_3D { - def _R : SUST_3D_base; - def _I : SUST_3D_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_3D_base; + def _I : SUST_3D_base; } -defm SUST_B_3D_B8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", Int16Regs>; -defm SUST_B_3D_B16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>; -defm SUST_B_3D_B32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>; -defm SUST_B_3D_B64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>; +defm SUST_B_3D_I8_CLAMP : SUST_3D<"sust.b.3d.b8.clamp", 
Int16Regs>; +defm SUST_B_3D_I16_CLAMP : SUST_3D<"sust.b.3d.b16.clamp", Int16Regs>; +defm SUST_B_3D_I32_CLAMP : SUST_3D<"sust.b.3d.b32.clamp", Int32Regs>; +defm SUST_B_3D_I64_CLAMP : SUST_3D<"sust.b.3d.b64.clamp", Int64Regs>; -defm SUST_B_3D_B8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>; -defm SUST_B_3D_B16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>; -defm SUST_B_3D_B32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>; -defm SUST_B_3D_B64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>; +defm SUST_B_3D_I8_TRAP : SUST_3D<"sust.b.3d.b8.trap", Int16Regs>; +defm SUST_B_3D_I16_TRAP : SUST_3D<"sust.b.3d.b16.trap", Int16Regs>; +defm SUST_B_3D_I32_TRAP : SUST_3D<"sust.b.3d.b32.trap", Int32Regs>; +defm SUST_B_3D_I64_TRAP : SUST_3D<"sust.b.3d.b64.trap", Int64Regs>; -defm SUST_B_3D_B8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>; -defm SUST_B_3D_B16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>; -defm SUST_B_3D_B32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>; -defm SUST_B_3D_B64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>; +defm SUST_B_3D_I8_ZERO : SUST_3D<"sust.b.3d.b8.zero", Int16Regs>; +defm SUST_B_3D_I16_ZERO : SUST_3D<"sust.b.3d.b16.zero", Int16Regs>; +defm SUST_B_3D_I32_ZERO : SUST_3D<"sust.b.3d.b32.zero", Int32Regs>; +defm SUST_B_3D_I64_ZERO : SUST_3D<"sust.b.3d.b64.zero", Int64Regs>; -defm SUST_P_3D_B8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>; -defm SUST_P_3D_B16_TRAP : SUST_3D<"sust.p.3d.b16.trap", Int16Regs>; -defm SUST_P_3D_B32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>; +defm SUST_P_3D_I8_TRAP : SUST_3D<"sust.p.3d.b8.trap", Int16Regs>; +defm SUST_P_3D_I16_TRAP : SUST_3D<"sust.p.3d.b16.trap", Int16Regs>; +defm SUST_P_3D_I32_TRAP : SUST_3D<"sust.p.3d.b32.trap", Int32Regs>; -class SUST_3D_V2_base +class SUST_3D_V2_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, intype:$r, intype:$g)), inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g\\};", - []>; + pat>; multiclass SUST_3D_V2 { - def _R : 
SUST_3D_V2_base; - def _I : SUST_3D_V2_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_3D_V2_base; + def _I : SUST_3D_V2_base; } -defm SUST_B_3D_V2B8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>; -defm SUST_B_3D_V2B16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>; -defm SUST_B_3D_V2B32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>; -defm SUST_B_3D_V2B64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>; +defm SUST_B_3D_V2I8_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b8.clamp", Int16Regs>; +defm SUST_B_3D_V2I16_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b16.clamp", Int16Regs>; +defm SUST_B_3D_V2I32_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b32.clamp", Int32Regs>; +defm SUST_B_3D_V2I64_CLAMP : SUST_3D_V2<"sust.b.3d.v2.b64.clamp", Int64Regs>; -defm SUST_B_3D_V2B8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>; -defm SUST_B_3D_V2B16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>; -defm SUST_B_3D_V2B32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>; -defm SUST_B_3D_V2B64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>; +defm SUST_B_3D_V2I8_TRAP : SUST_3D_V2<"sust.b.3d.v2.b8.trap", Int16Regs>; +defm SUST_B_3D_V2I16_TRAP : SUST_3D_V2<"sust.b.3d.v2.b16.trap", Int16Regs>; +defm SUST_B_3D_V2I32_TRAP : SUST_3D_V2<"sust.b.3d.v2.b32.trap", Int32Regs>; +defm SUST_B_3D_V2I64_TRAP : SUST_3D_V2<"sust.b.3d.v2.b64.trap", Int64Regs>; -defm SUST_B_3D_V2B8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>; -defm SUST_B_3D_V2B16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>; -defm SUST_B_3D_V2B32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>; -defm SUST_B_3D_V2B64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", Int64Regs>; +defm SUST_B_3D_V2I8_ZERO : SUST_3D_V2<"sust.b.3d.v2.b8.zero", Int16Regs>; +defm SUST_B_3D_V2I16_ZERO : SUST_3D_V2<"sust.b.3d.v2.b16.zero", Int16Regs>; +defm SUST_B_3D_V2I32_ZERO : SUST_3D_V2<"sust.b.3d.v2.b32.zero", Int32Regs>; +defm SUST_B_3D_V2I64_ZERO : SUST_3D_V2<"sust.b.3d.v2.b64.zero", 
Int64Regs>; -defm SUST_P_3D_V2B8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>; -defm SUST_P_3D_V2B16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>; -defm SUST_P_3D_V2B32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>; +defm SUST_P_3D_V2I8_TRAP : SUST_3D_V2<"sust.p.3d.v2.b8.trap", Int16Regs>; +defm SUST_P_3D_V2I16_TRAP : SUST_3D_V2<"sust.p.3d.v2.b16.trap", Int16Regs>; +defm SUST_P_3D_V2I32_TRAP : SUST_3D_V2<"sust.p.3d.v2.b32.trap", Int32Regs>; -class SUST_3D_V4_base +class SUST_3D_V4_base pat> : NVPTXInst<(outs), !con(surf, (ins Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, intype:$r, intype:$g, intype:$b, intype:$a)), inst # " \t[$s, \\{$x, $y, $z, $z\\}], \\{$r, $g, $b, $a\\};", - []>; + pat>; multiclass SUST_3D_V4 { - def _R : SUST_3D_V4_base; - def _I : SUST_3D_V4_base; + defvar intr = !cast("int_nvvm_" # !tolower(NAME)); + def _R : SUST_3D_V4_base; + def _I : SUST_3D_V4_base; } -defm SUST_B_3D_V4B8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>; -defm SUST_B_3D_V4B16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>; -defm SUST_B_3D_V4B32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>; +defm SUST_B_3D_V4I8_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b8.clamp", Int16Regs>; +defm SUST_B_3D_V4I16_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b16.clamp", Int16Regs>; +defm SUST_B_3D_V4I32_CLAMP : SUST_3D_V4<"sust.b.3d.v4.b32.clamp", Int32Regs>; + +defm SUST_B_3D_V4I8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>; +defm SUST_B_3D_V4I16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>; +defm SUST_B_3D_V4I32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>; + +defm SUST_B_3D_V4I8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>; +defm SUST_B_3D_V4I16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>; +defm SUST_B_3D_V4I32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>; -defm SUST_B_3D_V4B8_TRAP : SUST_3D_V4<"sust.b.3d.v4.b8.trap", Int16Regs>; -defm SUST_B_3D_V4B16_TRAP : SUST_3D_V4<"sust.b.3d.v4.b16.trap", Int16Regs>; 
-defm SUST_B_3D_V4B32_TRAP : SUST_3D_V4<"sust.b.3d.v4.b32.trap", Int32Regs>; - -defm SUST_B_3D_V4B8_ZERO : SUST_3D_V4<"sust.b.3d.v4.b8.zero", Int16Regs>; -defm SUST_B_3D_V4B16_ZERO : SUST_3D_V4<"sust.b.3d.v4.b16.zero", Int16Regs>; -defm SUST_B_3D_V4B32_ZERO : SUST_3D_V4<"sust.b.3d.v4.b32.zero", Int32Regs>; - -defm SUST_P_3D_V4B8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>; -defm SUST_P_3D_V4B16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>; -defm SUST_P_3D_V4B32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>; - -} - -// Surface store instruction patterns -// I'm not sure why we can't just include these in the instruction definitions, -// but TableGen complains of type errors :( - -// .clamp variant -def : Pat<(int_nvvm_sust_b_1d_i8_clamp - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i16_clamp - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i64_clamp - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; +defm SUST_P_3D_V4I8_TRAP : SUST_3D_V4<"sust.p.3d.v4.b8.trap", Int16Regs>; +defm SUST_P_3D_V4I16_TRAP : SUST_3D_V4<"sust.p.3d.v4.b16.trap", Int16Regs>; +defm SUST_P_3D_V4I32_TRAP : SUST_3D_V4<"sust.p.3d.v4.b32.trap", Int32Regs>; + +} -def : Pat<(int_nvvm_sust_b_1d_v2i8_clamp - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i16_clamp - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : 
Pat<(int_nvvm_sust_b_1d_v2i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i64_clamp - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i8_clamp - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i16_clamp - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i32_clamp - Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_1d_array_i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_ARRAY_B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i64_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_ARRAY_B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i8_clamp - Int64Regs:$s, Int32Regs:$l, 
Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i64_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - 
(SUST_B_2D_B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i64_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - (SUST_B_2D_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i64_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), - (SUST_B_2D_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B8_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B16_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, 
Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_array_i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B8_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B16_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_ARRAY_B32_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i64_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_ARRAY_B64_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B8_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B16_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, - Int32Regs:$g), - (SUST_B_2D_ARRAY_V2B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i64_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, - 
Int64Regs:$g), - (SUST_B_2D_ARRAY_V2B64_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i8_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B8_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i16_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B16_CLAMP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i32_clamp - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_ARRAY_V4B32_CLAMP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_3d_i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B8_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B16_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r), - (SUST_B_3D_B32_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i64_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r), - (SUST_B_3D_B64_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r)>; - -def : 
Pat<(int_nvvm_sust_b_3d_v2i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B8_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B16_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g), - (SUST_B_3D_V2B32_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i64_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g), - (SUST_B_3D_V2B64_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i8_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B8_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i16_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B16_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i32_clamp - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_3D_V4B32_CLAMP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - -// .trap variant -def : Pat<(int_nvvm_sust_b_1d_i8_trap - Int64Regs:$s, 
Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i64_trap - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i64_trap - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, - 
Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_1d_array_i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_ARRAY_B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i64_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_ARRAY_B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i64_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - 
Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i64_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - (SUST_B_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - 
Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i64_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), - (SUST_B_2D_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_array_i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B8_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B16_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_ARRAY_B32_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i64_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_ARRAY_B64_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : 
Pat<(int_nvvm_sust_b_2d_array_v2i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, - Int32Regs:$g), - (SUST_B_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i64_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, - Int64Regs:$g), - (SUST_B_2D_ARRAY_V2B64_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : 
Pat<(int_nvvm_sust_b_3d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r), - (SUST_B_3D_B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i64_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r), - (SUST_B_3D_B64_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g), - (SUST_B_3D_V2B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i64_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g), - (SUST_B_3D_V2B64_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, 
Int16Regs:$a), - (SUST_B_3D_V4B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_3D_V4B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - -// .zero variant -def : Pat<(int_nvvm_sust_b_1d_i8_zero - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i16_zero - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_i64_zero - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i8_zero - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i16_zero - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v2i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : 
Pat<(int_nvvm_sust_b_1d_v2i64_zero - Int64Regs:$s, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i8_zero - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i16_zero - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_v4i32_zero - Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_1d_array_i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_B_1D_ARRAY_B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), - (SUST_B_1D_ARRAY_B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_i64_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r), - (SUST_B_1D_ARRAY_B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i16_zero - 
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_B_1D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_B_1D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v2i64_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int64Regs:$r, Int64Regs:$g), - (SUST_B_1D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_1D_ARRAY_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_1d_array_v4i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_1D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_B32_ZERO_R 
Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_i64_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - (SUST_B_2D_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v2i64_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g), - (SUST_B_2D_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B8_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_V4B16_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_v4i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_2d_array_i8_zero - 
Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B8_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_B_2D_ARRAY_B16_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_B_2D_ARRAY_B32_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_i64_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r), - (SUST_B_2D_ARRAY_B64_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B8_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_2D_ARRAY_V2B16_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, - Int32Regs:$g), - (SUST_B_2D_ARRAY_V2B32_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v2i64_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, - Int64Regs:$g), - (SUST_B_2D_ARRAY_V2B64_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i8_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, 
Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B8_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i16_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_2D_ARRAY_V4B16_ZERO_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_2d_array_v4i32_zero - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_2D_ARRAY_V4B32_ZERO_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_b_3d_i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B8_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_B_3D_B16_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r), - (SUST_B_3D_B32_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_i64_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r), - (SUST_B_3D_B64_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B8_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : 
Pat<(int_nvvm_sust_b_3d_v2i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_B_3D_V2B16_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g), - (SUST_B_3D_V2B32_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v2i64_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g), - (SUST_B_3D_V2B64_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int64Regs:$r, Int64Regs:$g)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i8_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B8_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i16_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_B_3D_V4B16_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_b_3d_v4i32_zero - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_B_3D_V4B32_ZERO_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - - -def : Pat<(int_nvvm_sust_p_1d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_P_1D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r), - (SUST_P_1D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int16Regs:$r)>; - -def : 
Pat<(int_nvvm_sust_p_1d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r), - (SUST_P_1D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_P_1D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_P_1D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_P_1D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_1D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_1d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_1D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_1d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_1D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_p_1d_array_i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_P_1D_ARRAY_B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_array_i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r), - (SUST_P_1D_ARRAY_B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_array_i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r), - (SUST_P_1D_ARRAY_B32_TRAP_R Int64Regs:$s, 
Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v2i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_P_1D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v2i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int16Regs:$r, Int16Regs:$g), - (SUST_P_1D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v2i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$r, Int32Regs:$g), - (SUST_P_1D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v4i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_1D_ARRAY_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v4i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_1D_ARRAY_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_1d_array_v4i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_1D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_p_2d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_P_2D_B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_P_2D_B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : 
Pat<(int_nvvm_sust_p_2d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_P_2D_B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_P_2D_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r, Int16Regs:$g), - (SUST_P_2D_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g), - (SUST_P_2D_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_2D_V4B8_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_2d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_2D_V4B16_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_2d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_2D_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_p_2d_array_i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - (SUST_P_2D_ARRAY_B8_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_array_i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int16Regs:$r), - 
(SUST_P_2D_ARRAY_B16_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_array_i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r), - (SUST_P_2D_ARRAY_B32_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v2i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_P_2D_ARRAY_V2B8_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v2i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g), - (SUST_P_2D_ARRAY_V2B16_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v2i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, - Int32Regs:$g), - (SUST_P_2D_ARRAY_V2B32_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v4i8_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_2D_ARRAY_V4B8_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v4i16_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_2D_ARRAY_V4B16_TRAP_R Int64Regs:$s, - Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_2d_array_v4i32_trap - Int64Regs:$s, Int32Regs:$l, Int32Regs:$x, Int32Regs:$y, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_2D_ARRAY_V4B32_TRAP_R Int64Regs:$s, Int32Regs:$l, - Int32Regs:$x, Int32Regs:$y, 
- Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; - - - -def : Pat<(int_nvvm_sust_p_3d_i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_P_3D_B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_3d_i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r), - (SUST_P_3D_B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_3d_i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r), - (SUST_P_3D_B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r)>; - -def : Pat<(int_nvvm_sust_p_3d_v2i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_P_3D_V2B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_3d_v2i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g), - (SUST_P_3D_V2B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_3d_v2i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g), - (SUST_P_3D_V2B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g)>; - -def : Pat<(int_nvvm_sust_p_3d_v4i8_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_3D_V4B8_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_3d_v4i16_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a), - (SUST_P_3D_V4B16_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, 
Int32Regs:$z, - Int16Regs:$r, Int16Regs:$g, Int16Regs:$b, Int16Regs:$a)>; - -def : Pat<(int_nvvm_sust_p_3d_v4i32_trap - Int64Regs:$s, Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a), - (SUST_P_3D_V4B32_TRAP_R Int64Regs:$s, - Int32Regs:$x, Int32Regs:$y, Int32Regs:$z, - Int32Regs:$r, Int32Regs:$g, Int32Regs:$b, Int32Regs:$a)>; //----------------------------------- // Read Special Registers @@ -6411,13 +5040,13 @@ def : Pat<(int_nvvm_sust_p_3d_v4i32_trap class PTX_READ_SREG_R64 Preds=[]> : NVPTXInst<(outs Int64Regs:$d), (ins), - !strconcat("mov.u64 \t$d, %", regname, ";"), + "mov.u64 \t$d, %" # regname # ";", [(set i64:$d, (intop))]>, Requires; class PTX_READ_SREG_R32 Preds=[]> : NVPTXInst<(outs Int32Regs:$d), (ins), - !strconcat("mov.u32 \t$d, %", regname, ";"), + "mov.u32 \t$d, %" # regname # ";", [(set i32:$d, (intop))]>, Requires; @@ -6547,7 +5176,7 @@ class WMMA_REGINFO !or(!eq(ptx_elt_type, "f16"), !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<60>], - !and(!eq(geom,"m8n8k4"), + !and(!eq(geom, "m8n8k4"), !eq(ptx_elt_type, "f64")) : [hasSM<80>, hasPTX<70>], // fp16 -> fp16/fp32 @ m8n32k16/m32n8k16 @@ -6557,46 +5186,46 @@ class WMMA_REGINFO !eq(ptx_elt_type, "f32"))) : [hasSM<70>, hasPTX<61>], // u8/s8 -> s32 @ m16n16k16/m8n32k16/m32n8k16 - !and(!or(!eq(geom,"m16n16k16"), - !eq(geom,"m8n32k16"), - !eq(geom,"m32n8k16")), + !and(!or(!eq(geom, "m16n16k16"), + !eq(geom, "m8n32k16"), + !eq(geom, "m32n8k16")), !or(!eq(ptx_elt_type, "u8"), !eq(ptx_elt_type, "s8"), !eq(ptx_elt_type, "s32"))) : [hasSM<72>, hasPTX<63>], - !and(!or(!eq(geom,"m16n16k16"), - !eq(geom,"m8n32k16"), - !eq(geom,"m32n8k16")), + !and(!or(!eq(geom, "m16n16k16"), + !eq(geom, "m8n32k16"), + !eq(geom, "m32n8k16")), !eq(ptx_elt_type, "bf16")) : [hasSM<80>, hasPTX<70>], - !and(!eq(geom,"m16n16k8"), + !and(!eq(geom, "m16n16k8"), !eq(ptx_elt_type, "tf32")) : [hasSM<80>, hasPTX<70>], - !and(!eq(geom,"m16n16k8"), + !and(!eq(geom, "m16n16k8"), 
!eq(ptx_elt_type, "f32")) : [hasSM<80>, hasPTX<70>], // b1 -> s32 @ m8n8k128(b1) - !and(!ne(op,"mma"), - !eq(geom,"m8n8k128")) : [hasSM<75>, hasPTX<63>], + !and(!ne(op, "mma"), + !eq(geom, "m8n8k128")) : [hasSM<75>, hasPTX<63>], // u4/s4 -> s32 @ m8n8k32 (u4/s4) - !and(!ne(op,"mma"), - !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<63>], + !and(!ne(op, "mma"), + !eq(geom, "m8n8k32")) : [hasSM<75>, hasPTX<63>], - !or(!eq(geom,"m16n8k8"), - !eq(geom,"m8n8k16")) : [hasSM<75>, hasPTX<65>], + !or(!eq(geom, "m16n8k8"), + !eq(geom, "m8n8k16")) : [hasSM<75>, hasPTX<65>], - !and(!ne(ptx_elt_type,"f64"), + !and(!ne(ptx_elt_type, "f64"), !eq(geom, "m8n8k4")) : [hasSM<70>, hasPTX<64>], // mma m8n8k32 requires higher PTX version - !and(!eq(op,"mma"), - !eq(geom,"m8n8k32")) : [hasSM<75>, hasPTX<65>], + !and(!eq(op, "mma"), + !eq(geom, "m8n8k32")) : [hasSM<75>, hasPTX<65>], - !and(!eq(ptx_elt_type,"f64"), + !and(!eq(ptx_elt_type, "f64"), !eq(geom, "m8n8k4")) : [hasSM<80>, hasPTX<70>], - !and(!eq(op,"mma"), + !and(!eq(op, "mma"), !or(!eq(geom, "m16n8k16"), !eq(geom, "m16n8k4"), !eq(geom, "m16n8k32"), @@ -6605,28 +5234,28 @@ class WMMA_REGINFO !eq(geom, "m16n8k128"), !eq(geom, "m16n8k256"))) : [hasSM<80>, hasPTX<70>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b16"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b16"), !eq(geom, "m8n8")) : [hasSM<75>, hasPTX<65>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b8"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b8"), !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b8x16.b6x16_p32"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b8x16.b6x16_p32"), !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b8x16.b4x16_p64"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b8x16.b4x16_p64"), !eq(geom, "m16n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>], - !and(!eq(op,"ldmatrix"), - 
!eq(ptx_elt_type,"b8x16.b6x16_p32"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b8x16.b6x16_p32"), !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>], - !and(!eq(op,"ldmatrix"), - !eq(ptx_elt_type,"b8x16.b4x16_p64"), + !and(!eq(op, "ldmatrix"), + !eq(ptx_elt_type, "b8x16.b4x16_p64"), !eq(geom, "m8n16")) : [hasSM<100>, hasArchAccelFeatures, hasPTX<86>]); // template DAGs for instruction inputs/output. @@ -6655,7 +5284,7 @@ class WMMA_INSTR _Args> : NVPTXInst<(outs), (ins), "?", []> { Intrinsic Intr = !cast(_Intr); // Concatenate all arguments into a single dag. - dag Args = !foldl((ins), _Args, a, b, !con(a,b)); + dag Args = !foldl((ins), _Args, a, b, !con(a, b)); // Pre-build the pattern to match (intrinsic arg0, arg1, ...). dag IntrinsicPattern = BuildPatternI(Intr), Args>.ret; } @@ -6761,7 +5390,7 @@ class MMA_OP_PREDICATES { WMMA_REGINFO Frag = FragA; list ret = !listconcat( FragA.Predicates, - !if(!eq(b1op, ".and.popc"), [hasSM<80>,hasPTX<71>],[]) + !if(!eq(b1op, ".and.popc"), [hasSM<80>, hasPTX<71>], []) ); } // WMMA.MMA @@ -7008,25 +5637,22 @@ def INT_EXIT : BasicNVPTXInst<(outs), (ins), "exit", [(int_nvvm_exit)]>; // Tcgen05 intrinsics let isConvergent = true in { -multiclass TCGEN05_ALLOC_INTR { - def NAME : NVPTXInst<(outs), - (ins rc:$dst, Int32Regs:$ncols), - !strconcat("tcgen05.alloc.cta_group::", num, ".sync.aligned", AS, ".b32 [$dst], $ncols;"), - [(Intr rc:$dst, Int32Regs:$ncols)]>, +multiclass TCGEN05_ALLOC_INTR { + def "" : BasicNVPTXInst<(outs), + (ins ADDR:$dst, Int32Regs:$ncols), + "tcgen05.alloc.cta_group::" # num # ".sync.aligned" # AS # ".b32", + [(Intr addr:$dst, Int32Regs:$ncols)]>, Requires<[hasTcgen05Instructions]>; } -defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR; -defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR; +defm TCGEN05_ALLOC_CG1 : TCGEN05_ALLOC_INTR<"", "1", int_nvvm_tcgen05_alloc_cg1>; +defm TCGEN05_ALLOC_CG2 : TCGEN05_ALLOC_INTR<"", "2", int_nvvm_tcgen05_alloc_cg2>; -defm TCGEN05_ALLOC_S64_CG1 : 
TCGEN05_ALLOC_INTR; -defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR; - -defm TCGEN05_ALLOC_S32_CG1 : TCGEN05_ALLOC_INTR; -defm TCGEN05_ALLOC_S32_CG2 : TCGEN05_ALLOC_INTR; +defm TCGEN05_ALLOC_S64_CG1 : TCGEN05_ALLOC_INTR<".shared::cta", "1", int_nvvm_tcgen05_alloc_shared_cg1>; +defm TCGEN05_ALLOC_S64_CG2 : TCGEN05_ALLOC_INTR<".shared::cta", "2", int_nvvm_tcgen05_alloc_shared_cg2>; multiclass TCGEN05_DEALLOC_INTR { - def NAME : BasicNVPTXInst<(outs), + def "" : BasicNVPTXInst<(outs), (ins Int32Regs:$tmem_addr, Int32Regs:$ncols), "tcgen05.dealloc.cta_group::" # num # ".sync.aligned.b32", [(Intr Int32Regs:$tmem_addr, Int32Regs:$ncols)]>, @@ -7036,7 +5662,7 @@ defm TCGEN05_DEALLOC_CG1: TCGEN05_DEALLOC_INTR<"1", int_nvvm_tcgen05_dealloc_cg1 defm TCGEN05_DEALLOC_CG2: TCGEN05_DEALLOC_INTR<"2", int_nvvm_tcgen05_dealloc_cg2>; multiclass TCGEN05_RELINQ_PERMIT_INTR { - def NAME : BasicNVPTXInst<(outs), (ins), + def "" : BasicNVPTXInst<(outs), (ins), "tcgen05.relinquish_alloc_permit.cta_group::" # num # ".sync.aligned", [(Intr)]>, Requires<[hasTcgen05Instructions]>; @@ -7052,36 +5678,33 @@ def tcgen05_wait_st: BasicNVPTXInst<(outs), (ins), "tcgen05.wait::st.sync.aligne [(int_nvvm_tcgen05_wait_st)]>, Requires<[hasTcgen05Instructions]>; -multiclass TCGEN05_COMMIT_INTR { - defvar prefix = "tcgen05.commit.cta_group::" # num; - defvar suffix = ".mbarrier::arrive::one.shared::cluster"; +multiclass TCGEN05_COMMIT_INTR { + defvar prefix = "tcgen05.commit.cta_group::" # num #".mbarrier::arrive::one.shared::cluster"; defvar intr_suffix = !if(!eq(AS, "shared"), "_shared", "") # "_cg" # num; defvar Intr = !cast("int_nvvm_tcgen05_commit" # intr_suffix); defvar IntrMC = !cast("int_nvvm_tcgen05_commit_mc" # intr_suffix); - def NAME : NVPTXInst<(outs), (ins rc:$mbar), - !strconcat(prefix, suffix, ".b64 [$mbar];"), - [(Intr rc:$mbar)]>, + def "" : BasicNVPTXInst<(outs), (ins ADDR:$mbar), + prefix # ".b64", + [(Intr addr:$mbar)]>, Requires<[hasTcgen05Instructions]>; - def NAME # _MC : 
NVPTXInst<(outs), (ins rc:$mbar, Int16Regs:$mc), - !strconcat(prefix, suffix, ".multicast::cluster.b64 [$mbar], $mc;"), - [(IntrMC rc:$mbar, Int16Regs:$mc)]>, + def _MC : BasicNVPTXInst<(outs), (ins ADDR:$mbar, Int16Regs:$mc), + prefix # ".multicast::cluster.b64", + [(IntrMC addr:$mbar, Int16Regs:$mc)]>, Requires<[hasTcgen05Instructions]>; } -defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_S32_CG1 : TCGEN05_COMMIT_INTR; -defm TCGEN05_COMMIT_S32_CG2 : TCGEN05_COMMIT_INTR; +defm TCGEN05_COMMIT_CG1 : TCGEN05_COMMIT_INTR<"", "1">; +defm TCGEN05_COMMIT_CG2 : TCGEN05_COMMIT_INTR<"", "2">; +defm TCGEN05_COMMIT_S64_CG1 : TCGEN05_COMMIT_INTR<"shared", "1">; +defm TCGEN05_COMMIT_S64_CG2 : TCGEN05_COMMIT_INTR<"shared", "2">; multiclass TCGEN05_SHIFT_INTR { - def NAME : NVPTXInst<(outs), - (ins Int32Regs:$tmem_addr), - !strconcat("tcgen05.shift.cta_group::", num, ".down [$tmem_addr];"), - [(Intr Int32Regs:$tmem_addr)]>, + def "" : BasicNVPTXInst<(outs), + (ins ADDR:$tmem_addr), + "tcgen05.shift.cta_group::" # num # ".down", + [(Intr addr:$tmem_addr)]>, Requires<[hasTcgen05Instructions]>; } defm TCGEN05_SHIFT_CG1: TCGEN05_SHIFT_INTR<"1", int_nvvm_tcgen05_shift_down_cg1>; @@ -7099,15 +5722,15 @@ multiclass TCGEN05_CP_INTR { defvar IntrCG1 = !cast(intr_prefix # "_cg1"); defvar IntrCG2 = !cast(intr_prefix # "_cg2"); - def NAME # _cg1 : NVPTXInst<(outs), - (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), - "tcgen05.cp.cta_group::1." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", - [(IntrCG1 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + def _cg1 : BasicNVPTXInst<(outs), + (ins ADDR:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::1." 
# shape_mc_asm # fmt_asm, + [(IntrCG1 addr:$tmem_addr, Int64Regs:$sdesc)]>, Requires<[hasTcgen05Instructions]>; - def NAME # _cg2 : NVPTXInst<(outs), - (ins Int32Regs:$tmem_addr, Int64Regs:$sdesc), - "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm # " [$tmem_addr], $sdesc;", - [(IntrCG2 Int32Regs:$tmem_addr, Int64Regs:$sdesc)]>, + def _cg2 : BasicNVPTXInst<(outs), + (ins ADDR:$tmem_addr, Int64Regs:$sdesc), + "tcgen05.cp.cta_group::2." # shape_mc_asm # fmt_asm, + [(IntrCG2 addr:$tmem_addr, Int64Regs:$sdesc)]>, Requires<[hasTcgen05Instructions]>; } @@ -7222,17 +5845,18 @@ foreach shape = ["16x64b", "16x128b", "16x256b", "32x32b", "16x32bx2"] in { } // isConvergent // Bulk store instructions - +def st_bulk_imm : TImmLeaf; + def INT_NVVM_ST_BULK_GENERIC : - NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size), - "st.bulk [$dest_addr], $size, 0;", - [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, (i64 0))]>, + BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size, i64imm:$value), + "st.bulk", + [(int_nvvm_st_bulk addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>, Requires<[hasSM<100>, hasPTX<86>]>; def INT_NVVM_ST_BULK_SHARED_CTA: - NVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size), - "st.bulk.shared::cta [$dest_addr], $size, 0;", - [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, (i64 0))]>, + BasicNVPTXInst<(outs), (ins ADDR:$dest_addr, Int64Regs:$size, i64imm:$value), + "st.bulk.shared::cta", + [(int_nvvm_st_bulk_shared_cta addr:$dest_addr, i64:$size, st_bulk_imm:$value)]>, Requires<[hasSM<100>, hasPTX<86>]>; // @@ -7240,17 +5864,15 @@ def INT_NVVM_ST_BULK_SHARED_CTA: // def CLUSTERLAUNCHCONTRL_TRY_CANCEL: - NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), - "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128 " # - "[$addr], [$mbar];", + BasicNVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), + "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes.b128", 
[(int_nvvm_clusterlaunchcontrol_try_cancel_async_shared addr:$addr, addr:$mbar)]>, Requires<[hasSM<100>, hasPTX<86>]>; def CLUSTERLAUNCHCONTRL_TRY_CANCEL_MULTICAST: - NVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), + BasicNVPTXInst<(outs), (ins ADDR:$addr, ADDR:$mbar), "clusterlaunchcontrol.try_cancel.async.shared::cta.mbarrier::complete_tx::bytes" # - ".multicast::cluster::all.b128 " # - "[$addr], [$mbar];", + ".multicast::cluster::all.b128", [(int_nvvm_clusterlaunchcontrol_try_cancel_async_multicast_shared addr:$addr, addr:$mbar)]>, Requires<[hasSM<100>, hasArchAccelFeatures, hasPTX<86>]>; diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index 9b5fe473521a1..320c0fb6950a7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -408,426 +408,426 @@ static unsigned suldRegisterToIndexOpcode(unsigned RegOC) { static unsigned sustRegisterToIndexOpcode(unsigned RegOC) { switch (RegOC) { - case NVPTX::SUST_B_1D_B8_CLAMP_R: - return NVPTX::SUST_B_1D_B8_CLAMP_I; - case NVPTX::SUST_B_1D_B16_CLAMP_R: - return NVPTX::SUST_B_1D_B16_CLAMP_I; - case NVPTX::SUST_B_1D_B32_CLAMP_R: - return NVPTX::SUST_B_1D_B32_CLAMP_I; - case NVPTX::SUST_B_1D_B64_CLAMP_R: - return NVPTX::SUST_B_1D_B64_CLAMP_I; - case NVPTX::SUST_B_1D_V2B8_CLAMP_R: - return NVPTX::SUST_B_1D_V2B8_CLAMP_I; - case NVPTX::SUST_B_1D_V2B16_CLAMP_R: - return NVPTX::SUST_B_1D_V2B16_CLAMP_I; - case NVPTX::SUST_B_1D_V2B32_CLAMP_R: - return NVPTX::SUST_B_1D_V2B32_CLAMP_I; - case NVPTX::SUST_B_1D_V2B64_CLAMP_R: - return NVPTX::SUST_B_1D_V2B64_CLAMP_I; - case NVPTX::SUST_B_1D_V4B8_CLAMP_R: - return NVPTX::SUST_B_1D_V4B8_CLAMP_I; - case NVPTX::SUST_B_1D_V4B16_CLAMP_R: - return NVPTX::SUST_B_1D_V4B16_CLAMP_I; - case NVPTX::SUST_B_1D_V4B32_CLAMP_R: - return NVPTX::SUST_B_1D_V4B32_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_B8_CLAMP_I; - case 
NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_B16_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_B32_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_B64_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B8_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B16_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B32_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B64_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B8_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B16_CLAMP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B32_CLAMP_I; - case NVPTX::SUST_B_2D_B8_CLAMP_R: - return NVPTX::SUST_B_2D_B8_CLAMP_I; - case NVPTX::SUST_B_2D_B16_CLAMP_R: - return NVPTX::SUST_B_2D_B16_CLAMP_I; - case NVPTX::SUST_B_2D_B32_CLAMP_R: - return NVPTX::SUST_B_2D_B32_CLAMP_I; - case NVPTX::SUST_B_2D_B64_CLAMP_R: - return NVPTX::SUST_B_2D_B64_CLAMP_I; - case NVPTX::SUST_B_2D_V2B8_CLAMP_R: - return NVPTX::SUST_B_2D_V2B8_CLAMP_I; - case NVPTX::SUST_B_2D_V2B16_CLAMP_R: - return NVPTX::SUST_B_2D_V2B16_CLAMP_I; - case NVPTX::SUST_B_2D_V2B32_CLAMP_R: - return NVPTX::SUST_B_2D_V2B32_CLAMP_I; - case NVPTX::SUST_B_2D_V2B64_CLAMP_R: - return NVPTX::SUST_B_2D_V2B64_CLAMP_I; - case NVPTX::SUST_B_2D_V4B8_CLAMP_R: - return NVPTX::SUST_B_2D_V4B8_CLAMP_I; - case NVPTX::SUST_B_2D_V4B16_CLAMP_R: - return NVPTX::SUST_B_2D_V4B16_CLAMP_I; - case NVPTX::SUST_B_2D_V4B32_CLAMP_R: - return NVPTX::SUST_B_2D_V4B32_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_B8_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_B16_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_R: - return 
NVPTX::SUST_B_2D_ARRAY_B32_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_B64_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B8_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B16_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B32_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B64_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B8_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B16_CLAMP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B32_CLAMP_I; - case NVPTX::SUST_B_3D_B8_CLAMP_R: - return NVPTX::SUST_B_3D_B8_CLAMP_I; - case NVPTX::SUST_B_3D_B16_CLAMP_R: - return NVPTX::SUST_B_3D_B16_CLAMP_I; - case NVPTX::SUST_B_3D_B32_CLAMP_R: - return NVPTX::SUST_B_3D_B32_CLAMP_I; - case NVPTX::SUST_B_3D_B64_CLAMP_R: - return NVPTX::SUST_B_3D_B64_CLAMP_I; - case NVPTX::SUST_B_3D_V2B8_CLAMP_R: - return NVPTX::SUST_B_3D_V2B8_CLAMP_I; - case NVPTX::SUST_B_3D_V2B16_CLAMP_R: - return NVPTX::SUST_B_3D_V2B16_CLAMP_I; - case NVPTX::SUST_B_3D_V2B32_CLAMP_R: - return NVPTX::SUST_B_3D_V2B32_CLAMP_I; - case NVPTX::SUST_B_3D_V2B64_CLAMP_R: - return NVPTX::SUST_B_3D_V2B64_CLAMP_I; - case NVPTX::SUST_B_3D_V4B8_CLAMP_R: - return NVPTX::SUST_B_3D_V4B8_CLAMP_I; - case NVPTX::SUST_B_3D_V4B16_CLAMP_R: - return NVPTX::SUST_B_3D_V4B16_CLAMP_I; - case NVPTX::SUST_B_3D_V4B32_CLAMP_R: - return NVPTX::SUST_B_3D_V4B32_CLAMP_I; - case NVPTX::SUST_B_1D_B8_TRAP_R: - return NVPTX::SUST_B_1D_B8_TRAP_I; - case NVPTX::SUST_B_1D_B16_TRAP_R: - return NVPTX::SUST_B_1D_B16_TRAP_I; - case NVPTX::SUST_B_1D_B32_TRAP_R: - return NVPTX::SUST_B_1D_B32_TRAP_I; - case NVPTX::SUST_B_1D_B64_TRAP_R: - return NVPTX::SUST_B_1D_B64_TRAP_I; - case NVPTX::SUST_B_1D_V2B8_TRAP_R: - return NVPTX::SUST_B_1D_V2B8_TRAP_I; - case 
NVPTX::SUST_B_1D_V2B16_TRAP_R: - return NVPTX::SUST_B_1D_V2B16_TRAP_I; - case NVPTX::SUST_B_1D_V2B32_TRAP_R: - return NVPTX::SUST_B_1D_V2B32_TRAP_I; - case NVPTX::SUST_B_1D_V2B64_TRAP_R: - return NVPTX::SUST_B_1D_V2B64_TRAP_I; - case NVPTX::SUST_B_1D_V4B8_TRAP_R: - return NVPTX::SUST_B_1D_V4B8_TRAP_I; - case NVPTX::SUST_B_1D_V4B16_TRAP_R: - return NVPTX::SUST_B_1D_V4B16_TRAP_I; - case NVPTX::SUST_B_1D_V4B32_TRAP_R: - return NVPTX::SUST_B_1D_V4B32_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_B8_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_B8_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_B16_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_B16_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_B32_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_B32_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_B64_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_B64_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B8_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B16_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B32_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V2B64_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B8_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B16_TRAP_I; - case NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_R: - return NVPTX::SUST_B_1D_ARRAY_V4B32_TRAP_I; - case NVPTX::SUST_B_2D_B8_TRAP_R: - return NVPTX::SUST_B_2D_B8_TRAP_I; - case NVPTX::SUST_B_2D_B16_TRAP_R: - return NVPTX::SUST_B_2D_B16_TRAP_I; - case NVPTX::SUST_B_2D_B32_TRAP_R: - return NVPTX::SUST_B_2D_B32_TRAP_I; - case NVPTX::SUST_B_2D_B64_TRAP_R: - return NVPTX::SUST_B_2D_B64_TRAP_I; - case NVPTX::SUST_B_2D_V2B8_TRAP_R: - return NVPTX::SUST_B_2D_V2B8_TRAP_I; - case NVPTX::SUST_B_2D_V2B16_TRAP_R: - return NVPTX::SUST_B_2D_V2B16_TRAP_I; - case NVPTX::SUST_B_2D_V2B32_TRAP_R: - return NVPTX::SUST_B_2D_V2B32_TRAP_I; - case NVPTX::SUST_B_2D_V2B64_TRAP_R: - return 
NVPTX::SUST_B_2D_V2B64_TRAP_I; - case NVPTX::SUST_B_2D_V4B8_TRAP_R: - return NVPTX::SUST_B_2D_V4B8_TRAP_I; - case NVPTX::SUST_B_2D_V4B16_TRAP_R: - return NVPTX::SUST_B_2D_V4B16_TRAP_I; - case NVPTX::SUST_B_2D_V4B32_TRAP_R: - return NVPTX::SUST_B_2D_V4B32_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_B8_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_B8_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_B16_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_B16_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_B32_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_B32_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_B64_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_B64_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B8_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B16_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B32_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V2B64_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B8_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B16_TRAP_I; - case NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_R: - return NVPTX::SUST_B_2D_ARRAY_V4B32_TRAP_I; - case NVPTX::SUST_B_3D_B8_TRAP_R: - return NVPTX::SUST_B_3D_B8_TRAP_I; - case NVPTX::SUST_B_3D_B16_TRAP_R: - return NVPTX::SUST_B_3D_B16_TRAP_I; - case NVPTX::SUST_B_3D_B32_TRAP_R: - return NVPTX::SUST_B_3D_B32_TRAP_I; - case NVPTX::SUST_B_3D_B64_TRAP_R: - return NVPTX::SUST_B_3D_B64_TRAP_I; - case NVPTX::SUST_B_3D_V2B8_TRAP_R: - return NVPTX::SUST_B_3D_V2B8_TRAP_I; - case NVPTX::SUST_B_3D_V2B16_TRAP_R: - return NVPTX::SUST_B_3D_V2B16_TRAP_I; - case NVPTX::SUST_B_3D_V2B32_TRAP_R: - return NVPTX::SUST_B_3D_V2B32_TRAP_I; - case NVPTX::SUST_B_3D_V2B64_TRAP_R: - return NVPTX::SUST_B_3D_V2B64_TRAP_I; - case NVPTX::SUST_B_3D_V4B8_TRAP_R: - return NVPTX::SUST_B_3D_V4B8_TRAP_I; - case NVPTX::SUST_B_3D_V4B16_TRAP_R: - return NVPTX::SUST_B_3D_V4B16_TRAP_I; - case 
NVPTX::SUST_B_3D_V4B32_TRAP_R: - return NVPTX::SUST_B_3D_V4B32_TRAP_I; - case NVPTX::SUST_B_1D_B8_ZERO_R: - return NVPTX::SUST_B_1D_B8_ZERO_I; - case NVPTX::SUST_B_1D_B16_ZERO_R: - return NVPTX::SUST_B_1D_B16_ZERO_I; - case NVPTX::SUST_B_1D_B32_ZERO_R: - return NVPTX::SUST_B_1D_B32_ZERO_I; - case NVPTX::SUST_B_1D_B64_ZERO_R: - return NVPTX::SUST_B_1D_B64_ZERO_I; - case NVPTX::SUST_B_1D_V2B8_ZERO_R: - return NVPTX::SUST_B_1D_V2B8_ZERO_I; - case NVPTX::SUST_B_1D_V2B16_ZERO_R: - return NVPTX::SUST_B_1D_V2B16_ZERO_I; - case NVPTX::SUST_B_1D_V2B32_ZERO_R: - return NVPTX::SUST_B_1D_V2B32_ZERO_I; - case NVPTX::SUST_B_1D_V2B64_ZERO_R: - return NVPTX::SUST_B_1D_V2B64_ZERO_I; - case NVPTX::SUST_B_1D_V4B8_ZERO_R: - return NVPTX::SUST_B_1D_V4B8_ZERO_I; - case NVPTX::SUST_B_1D_V4B16_ZERO_R: - return NVPTX::SUST_B_1D_V4B16_ZERO_I; - case NVPTX::SUST_B_1D_V4B32_ZERO_R: - return NVPTX::SUST_B_1D_V4B32_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_B8_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_B8_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_B16_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_B16_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_B32_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_B32_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_B64_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_B64_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V2B8_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V2B16_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V2B32_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V2B64_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V4B8_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V4B16_ZERO_I; - case NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_R: - return NVPTX::SUST_B_1D_ARRAY_V4B32_ZERO_I; - case NVPTX::SUST_B_2D_B8_ZERO_R: - return NVPTX::SUST_B_2D_B8_ZERO_I; - case NVPTX::SUST_B_2D_B16_ZERO_R: - return 
NVPTX::SUST_B_2D_B16_ZERO_I; - case NVPTX::SUST_B_2D_B32_ZERO_R: - return NVPTX::SUST_B_2D_B32_ZERO_I; - case NVPTX::SUST_B_2D_B64_ZERO_R: - return NVPTX::SUST_B_2D_B64_ZERO_I; - case NVPTX::SUST_B_2D_V2B8_ZERO_R: - return NVPTX::SUST_B_2D_V2B8_ZERO_I; - case NVPTX::SUST_B_2D_V2B16_ZERO_R: - return NVPTX::SUST_B_2D_V2B16_ZERO_I; - case NVPTX::SUST_B_2D_V2B32_ZERO_R: - return NVPTX::SUST_B_2D_V2B32_ZERO_I; - case NVPTX::SUST_B_2D_V2B64_ZERO_R: - return NVPTX::SUST_B_2D_V2B64_ZERO_I; - case NVPTX::SUST_B_2D_V4B8_ZERO_R: - return NVPTX::SUST_B_2D_V4B8_ZERO_I; - case NVPTX::SUST_B_2D_V4B16_ZERO_R: - return NVPTX::SUST_B_2D_V4B16_ZERO_I; - case NVPTX::SUST_B_2D_V4B32_ZERO_R: - return NVPTX::SUST_B_2D_V4B32_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_B8_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_B8_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_B16_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_B16_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_B32_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_B32_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_B64_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_B64_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V2B8_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V2B16_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V2B32_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V2B64_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V4B8_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V4B16_ZERO_I; - case NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_R: - return NVPTX::SUST_B_2D_ARRAY_V4B32_ZERO_I; - case NVPTX::SUST_B_3D_B8_ZERO_R: - return NVPTX::SUST_B_3D_B8_ZERO_I; - case NVPTX::SUST_B_3D_B16_ZERO_R: - return NVPTX::SUST_B_3D_B16_ZERO_I; - case NVPTX::SUST_B_3D_B32_ZERO_R: - return NVPTX::SUST_B_3D_B32_ZERO_I; - case NVPTX::SUST_B_3D_B64_ZERO_R: - return NVPTX::SUST_B_3D_B64_ZERO_I; - case 
NVPTX::SUST_B_3D_V2B8_ZERO_R: - return NVPTX::SUST_B_3D_V2B8_ZERO_I; - case NVPTX::SUST_B_3D_V2B16_ZERO_R: - return NVPTX::SUST_B_3D_V2B16_ZERO_I; - case NVPTX::SUST_B_3D_V2B32_ZERO_R: - return NVPTX::SUST_B_3D_V2B32_ZERO_I; - case NVPTX::SUST_B_3D_V2B64_ZERO_R: - return NVPTX::SUST_B_3D_V2B64_ZERO_I; - case NVPTX::SUST_B_3D_V4B8_ZERO_R: - return NVPTX::SUST_B_3D_V4B8_ZERO_I; - case NVPTX::SUST_B_3D_V4B16_ZERO_R: - return NVPTX::SUST_B_3D_V4B16_ZERO_I; - case NVPTX::SUST_B_3D_V4B32_ZERO_R: - return NVPTX::SUST_B_3D_V4B32_ZERO_I; - case NVPTX::SUST_P_1D_B8_TRAP_R: - return NVPTX::SUST_P_1D_B8_TRAP_I; - case NVPTX::SUST_P_1D_B16_TRAP_R: - return NVPTX::SUST_P_1D_B16_TRAP_I; - case NVPTX::SUST_P_1D_B32_TRAP_R: - return NVPTX::SUST_P_1D_B32_TRAP_I; - case NVPTX::SUST_P_1D_V2B8_TRAP_R: - return NVPTX::SUST_P_1D_V2B8_TRAP_I; - case NVPTX::SUST_P_1D_V2B16_TRAP_R: - return NVPTX::SUST_P_1D_V2B16_TRAP_I; - case NVPTX::SUST_P_1D_V2B32_TRAP_R: - return NVPTX::SUST_P_1D_V2B32_TRAP_I; - case NVPTX::SUST_P_1D_V4B8_TRAP_R: - return NVPTX::SUST_P_1D_V4B8_TRAP_I; - case NVPTX::SUST_P_1D_V4B16_TRAP_R: - return NVPTX::SUST_P_1D_V4B16_TRAP_I; - case NVPTX::SUST_P_1D_V4B32_TRAP_R: - return NVPTX::SUST_P_1D_V4B32_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_B8_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_B8_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_B16_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_B16_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_B32_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_B32_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V2B8_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V2B16_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V2B32_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V4B8_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_R: - return NVPTX::SUST_P_1D_ARRAY_V4B16_TRAP_I; - case NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_R: - return 
NVPTX::SUST_P_1D_ARRAY_V4B32_TRAP_I; - case NVPTX::SUST_P_2D_B8_TRAP_R: - return NVPTX::SUST_P_2D_B8_TRAP_I; - case NVPTX::SUST_P_2D_B16_TRAP_R: - return NVPTX::SUST_P_2D_B16_TRAP_I; - case NVPTX::SUST_P_2D_B32_TRAP_R: - return NVPTX::SUST_P_2D_B32_TRAP_I; - case NVPTX::SUST_P_2D_V2B8_TRAP_R: - return NVPTX::SUST_P_2D_V2B8_TRAP_I; - case NVPTX::SUST_P_2D_V2B16_TRAP_R: - return NVPTX::SUST_P_2D_V2B16_TRAP_I; - case NVPTX::SUST_P_2D_V2B32_TRAP_R: - return NVPTX::SUST_P_2D_V2B32_TRAP_I; - case NVPTX::SUST_P_2D_V4B8_TRAP_R: - return NVPTX::SUST_P_2D_V4B8_TRAP_I; - case NVPTX::SUST_P_2D_V4B16_TRAP_R: - return NVPTX::SUST_P_2D_V4B16_TRAP_I; - case NVPTX::SUST_P_2D_V4B32_TRAP_R: - return NVPTX::SUST_P_2D_V4B32_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_B8_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_B8_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_B16_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_B16_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_B32_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_B32_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V2B8_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V2B16_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V2B32_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V4B8_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V4B16_TRAP_I; - case NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_R: - return NVPTX::SUST_P_2D_ARRAY_V4B32_TRAP_I; - case NVPTX::SUST_P_3D_B8_TRAP_R: - return NVPTX::SUST_P_3D_B8_TRAP_I; - case NVPTX::SUST_P_3D_B16_TRAP_R: - return NVPTX::SUST_P_3D_B16_TRAP_I; - case NVPTX::SUST_P_3D_B32_TRAP_R: - return NVPTX::SUST_P_3D_B32_TRAP_I; - case NVPTX::SUST_P_3D_V2B8_TRAP_R: - return NVPTX::SUST_P_3D_V2B8_TRAP_I; - case NVPTX::SUST_P_3D_V2B16_TRAP_R: - return NVPTX::SUST_P_3D_V2B16_TRAP_I; - case NVPTX::SUST_P_3D_V2B32_TRAP_R: - return NVPTX::SUST_P_3D_V2B32_TRAP_I; - case 
NVPTX::SUST_P_3D_V4B8_TRAP_R: - return NVPTX::SUST_P_3D_V4B8_TRAP_I; - case NVPTX::SUST_P_3D_V4B16_TRAP_R: - return NVPTX::SUST_P_3D_V4B16_TRAP_I; - case NVPTX::SUST_P_3D_V4B32_TRAP_R: - return NVPTX::SUST_P_3D_V4B32_TRAP_I; + case NVPTX::SUST_B_1D_I8_CLAMP_R: + return NVPTX::SUST_B_1D_I8_CLAMP_I; + case NVPTX::SUST_B_1D_I16_CLAMP_R: + return NVPTX::SUST_B_1D_I16_CLAMP_I; + case NVPTX::SUST_B_1D_I32_CLAMP_R: + return NVPTX::SUST_B_1D_I32_CLAMP_I; + case NVPTX::SUST_B_1D_I64_CLAMP_R: + return NVPTX::SUST_B_1D_I64_CLAMP_I; + case NVPTX::SUST_B_1D_V2I8_CLAMP_R: + return NVPTX::SUST_B_1D_V2I8_CLAMP_I; + case NVPTX::SUST_B_1D_V2I16_CLAMP_R: + return NVPTX::SUST_B_1D_V2I16_CLAMP_I; + case NVPTX::SUST_B_1D_V2I32_CLAMP_R: + return NVPTX::SUST_B_1D_V2I32_CLAMP_I; + case NVPTX::SUST_B_1D_V2I64_CLAMP_R: + return NVPTX::SUST_B_1D_V2I64_CLAMP_I; + case NVPTX::SUST_B_1D_V4I8_CLAMP_R: + return NVPTX::SUST_B_1D_V4I8_CLAMP_I; + case NVPTX::SUST_B_1D_V4I16_CLAMP_R: + return NVPTX::SUST_B_1D_V4I16_CLAMP_I; + case NVPTX::SUST_B_1D_V4I32_CLAMP_R: + return NVPTX::SUST_B_1D_V4I32_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_I8_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_I8_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_I16_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_I16_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_I32_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_I32_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_I64_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_I64_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I8_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I8_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I16_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I16_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I32_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I32_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I64_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I64_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I8_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I8_CLAMP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I16_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I16_CLAMP_I; + case 
NVPTX::SUST_B_1D_ARRAY_V4I32_CLAMP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I32_CLAMP_I; + case NVPTX::SUST_B_2D_I8_CLAMP_R: + return NVPTX::SUST_B_2D_I8_CLAMP_I; + case NVPTX::SUST_B_2D_I16_CLAMP_R: + return NVPTX::SUST_B_2D_I16_CLAMP_I; + case NVPTX::SUST_B_2D_I32_CLAMP_R: + return NVPTX::SUST_B_2D_I32_CLAMP_I; + case NVPTX::SUST_B_2D_I64_CLAMP_R: + return NVPTX::SUST_B_2D_I64_CLAMP_I; + case NVPTX::SUST_B_2D_V2I8_CLAMP_R: + return NVPTX::SUST_B_2D_V2I8_CLAMP_I; + case NVPTX::SUST_B_2D_V2I16_CLAMP_R: + return NVPTX::SUST_B_2D_V2I16_CLAMP_I; + case NVPTX::SUST_B_2D_V2I32_CLAMP_R: + return NVPTX::SUST_B_2D_V2I32_CLAMP_I; + case NVPTX::SUST_B_2D_V2I64_CLAMP_R: + return NVPTX::SUST_B_2D_V2I64_CLAMP_I; + case NVPTX::SUST_B_2D_V4I8_CLAMP_R: + return NVPTX::SUST_B_2D_V4I8_CLAMP_I; + case NVPTX::SUST_B_2D_V4I16_CLAMP_R: + return NVPTX::SUST_B_2D_V4I16_CLAMP_I; + case NVPTX::SUST_B_2D_V4I32_CLAMP_R: + return NVPTX::SUST_B_2D_V4I32_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_I8_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_I8_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_I16_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_I16_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_I32_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_I32_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_I64_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_I64_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I8_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I8_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I16_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I16_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I32_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I32_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I64_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I64_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I8_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I8_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I16_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I16_CLAMP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I32_CLAMP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I32_CLAMP_I; + case NVPTX::SUST_B_3D_I8_CLAMP_R: + return 
NVPTX::SUST_B_3D_I8_CLAMP_I; + case NVPTX::SUST_B_3D_I16_CLAMP_R: + return NVPTX::SUST_B_3D_I16_CLAMP_I; + case NVPTX::SUST_B_3D_I32_CLAMP_R: + return NVPTX::SUST_B_3D_I32_CLAMP_I; + case NVPTX::SUST_B_3D_I64_CLAMP_R: + return NVPTX::SUST_B_3D_I64_CLAMP_I; + case NVPTX::SUST_B_3D_V2I8_CLAMP_R: + return NVPTX::SUST_B_3D_V2I8_CLAMP_I; + case NVPTX::SUST_B_3D_V2I16_CLAMP_R: + return NVPTX::SUST_B_3D_V2I16_CLAMP_I; + case NVPTX::SUST_B_3D_V2I32_CLAMP_R: + return NVPTX::SUST_B_3D_V2I32_CLAMP_I; + case NVPTX::SUST_B_3D_V2I64_CLAMP_R: + return NVPTX::SUST_B_3D_V2I64_CLAMP_I; + case NVPTX::SUST_B_3D_V4I8_CLAMP_R: + return NVPTX::SUST_B_3D_V4I8_CLAMP_I; + case NVPTX::SUST_B_3D_V4I16_CLAMP_R: + return NVPTX::SUST_B_3D_V4I16_CLAMP_I; + case NVPTX::SUST_B_3D_V4I32_CLAMP_R: + return NVPTX::SUST_B_3D_V4I32_CLAMP_I; + case NVPTX::SUST_B_1D_I8_TRAP_R: + return NVPTX::SUST_B_1D_I8_TRAP_I; + case NVPTX::SUST_B_1D_I16_TRAP_R: + return NVPTX::SUST_B_1D_I16_TRAP_I; + case NVPTX::SUST_B_1D_I32_TRAP_R: + return NVPTX::SUST_B_1D_I32_TRAP_I; + case NVPTX::SUST_B_1D_I64_TRAP_R: + return NVPTX::SUST_B_1D_I64_TRAP_I; + case NVPTX::SUST_B_1D_V2I8_TRAP_R: + return NVPTX::SUST_B_1D_V2I8_TRAP_I; + case NVPTX::SUST_B_1D_V2I16_TRAP_R: + return NVPTX::SUST_B_1D_V2I16_TRAP_I; + case NVPTX::SUST_B_1D_V2I32_TRAP_R: + return NVPTX::SUST_B_1D_V2I32_TRAP_I; + case NVPTX::SUST_B_1D_V2I64_TRAP_R: + return NVPTX::SUST_B_1D_V2I64_TRAP_I; + case NVPTX::SUST_B_1D_V4I8_TRAP_R: + return NVPTX::SUST_B_1D_V4I8_TRAP_I; + case NVPTX::SUST_B_1D_V4I16_TRAP_R: + return NVPTX::SUST_B_1D_V4I16_TRAP_I; + case NVPTX::SUST_B_1D_V4I32_TRAP_R: + return NVPTX::SUST_B_1D_V4I32_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_I8_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_I8_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_I16_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_I16_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_I32_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_I32_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_I64_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_I64_TRAP_I; + case 
NVPTX::SUST_B_1D_ARRAY_V2I8_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I8_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I16_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I16_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I32_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I32_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V2I64_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V2I64_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I8_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I8_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I16_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I16_TRAP_I; + case NVPTX::SUST_B_1D_ARRAY_V4I32_TRAP_R: + return NVPTX::SUST_B_1D_ARRAY_V4I32_TRAP_I; + case NVPTX::SUST_B_2D_I8_TRAP_R: + return NVPTX::SUST_B_2D_I8_TRAP_I; + case NVPTX::SUST_B_2D_I16_TRAP_R: + return NVPTX::SUST_B_2D_I16_TRAP_I; + case NVPTX::SUST_B_2D_I32_TRAP_R: + return NVPTX::SUST_B_2D_I32_TRAP_I; + case NVPTX::SUST_B_2D_I64_TRAP_R: + return NVPTX::SUST_B_2D_I64_TRAP_I; + case NVPTX::SUST_B_2D_V2I8_TRAP_R: + return NVPTX::SUST_B_2D_V2I8_TRAP_I; + case NVPTX::SUST_B_2D_V2I16_TRAP_R: + return NVPTX::SUST_B_2D_V2I16_TRAP_I; + case NVPTX::SUST_B_2D_V2I32_TRAP_R: + return NVPTX::SUST_B_2D_V2I32_TRAP_I; + case NVPTX::SUST_B_2D_V2I64_TRAP_R: + return NVPTX::SUST_B_2D_V2I64_TRAP_I; + case NVPTX::SUST_B_2D_V4I8_TRAP_R: + return NVPTX::SUST_B_2D_V4I8_TRAP_I; + case NVPTX::SUST_B_2D_V4I16_TRAP_R: + return NVPTX::SUST_B_2D_V4I16_TRAP_I; + case NVPTX::SUST_B_2D_V4I32_TRAP_R: + return NVPTX::SUST_B_2D_V4I32_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_I8_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_I8_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_I16_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_I16_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_I32_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_I32_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_I64_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_I64_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I8_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I8_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I16_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I16_TRAP_I; + case 
NVPTX::SUST_B_2D_ARRAY_V2I32_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I32_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V2I64_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V2I64_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I8_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I8_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I16_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I16_TRAP_I; + case NVPTX::SUST_B_2D_ARRAY_V4I32_TRAP_R: + return NVPTX::SUST_B_2D_ARRAY_V4I32_TRAP_I; + case NVPTX::SUST_B_3D_I8_TRAP_R: + return NVPTX::SUST_B_3D_I8_TRAP_I; + case NVPTX::SUST_B_3D_I16_TRAP_R: + return NVPTX::SUST_B_3D_I16_TRAP_I; + case NVPTX::SUST_B_3D_I32_TRAP_R: + return NVPTX::SUST_B_3D_I32_TRAP_I; + case NVPTX::SUST_B_3D_I64_TRAP_R: + return NVPTX::SUST_B_3D_I64_TRAP_I; + case NVPTX::SUST_B_3D_V2I8_TRAP_R: + return NVPTX::SUST_B_3D_V2I8_TRAP_I; + case NVPTX::SUST_B_3D_V2I16_TRAP_R: + return NVPTX::SUST_B_3D_V2I16_TRAP_I; + case NVPTX::SUST_B_3D_V2I32_TRAP_R: + return NVPTX::SUST_B_3D_V2I32_TRAP_I; + case NVPTX::SUST_B_3D_V2I64_TRAP_R: + return NVPTX::SUST_B_3D_V2I64_TRAP_I; + case NVPTX::SUST_B_3D_V4I8_TRAP_R: + return NVPTX::SUST_B_3D_V4I8_TRAP_I; + case NVPTX::SUST_B_3D_V4I16_TRAP_R: + return NVPTX::SUST_B_3D_V4I16_TRAP_I; + case NVPTX::SUST_B_3D_V4I32_TRAP_R: + return NVPTX::SUST_B_3D_V4I32_TRAP_I; + case NVPTX::SUST_B_1D_I8_ZERO_R: + return NVPTX::SUST_B_1D_I8_ZERO_I; + case NVPTX::SUST_B_1D_I16_ZERO_R: + return NVPTX::SUST_B_1D_I16_ZERO_I; + case NVPTX::SUST_B_1D_I32_ZERO_R: + return NVPTX::SUST_B_1D_I32_ZERO_I; + case NVPTX::SUST_B_1D_I64_ZERO_R: + return NVPTX::SUST_B_1D_I64_ZERO_I; + case NVPTX::SUST_B_1D_V2I8_ZERO_R: + return NVPTX::SUST_B_1D_V2I8_ZERO_I; + case NVPTX::SUST_B_1D_V2I16_ZERO_R: + return NVPTX::SUST_B_1D_V2I16_ZERO_I; + case NVPTX::SUST_B_1D_V2I32_ZERO_R: + return NVPTX::SUST_B_1D_V2I32_ZERO_I; + case NVPTX::SUST_B_1D_V2I64_ZERO_R: + return NVPTX::SUST_B_1D_V2I64_ZERO_I; + case NVPTX::SUST_B_1D_V4I8_ZERO_R: + return NVPTX::SUST_B_1D_V4I8_ZERO_I; + case NVPTX::SUST_B_1D_V4I16_ZERO_R: + 
return NVPTX::SUST_B_1D_V4I16_ZERO_I; + case NVPTX::SUST_B_1D_V4I32_ZERO_R: + return NVPTX::SUST_B_1D_V4I32_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_I8_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_I8_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_I16_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_I16_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_I32_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_I32_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_I64_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_I64_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V2I8_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V2I8_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V2I16_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V2I16_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V2I32_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V2I32_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V2I64_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V2I64_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V4I8_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V4I8_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V4I16_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V4I16_ZERO_I; + case NVPTX::SUST_B_1D_ARRAY_V4I32_ZERO_R: + return NVPTX::SUST_B_1D_ARRAY_V4I32_ZERO_I; + case NVPTX::SUST_B_2D_I8_ZERO_R: + return NVPTX::SUST_B_2D_I8_ZERO_I; + case NVPTX::SUST_B_2D_I16_ZERO_R: + return NVPTX::SUST_B_2D_I16_ZERO_I; + case NVPTX::SUST_B_2D_I32_ZERO_R: + return NVPTX::SUST_B_2D_I32_ZERO_I; + case NVPTX::SUST_B_2D_I64_ZERO_R: + return NVPTX::SUST_B_2D_I64_ZERO_I; + case NVPTX::SUST_B_2D_V2I8_ZERO_R: + return NVPTX::SUST_B_2D_V2I8_ZERO_I; + case NVPTX::SUST_B_2D_V2I16_ZERO_R: + return NVPTX::SUST_B_2D_V2I16_ZERO_I; + case NVPTX::SUST_B_2D_V2I32_ZERO_R: + return NVPTX::SUST_B_2D_V2I32_ZERO_I; + case NVPTX::SUST_B_2D_V2I64_ZERO_R: + return NVPTX::SUST_B_2D_V2I64_ZERO_I; + case NVPTX::SUST_B_2D_V4I8_ZERO_R: + return NVPTX::SUST_B_2D_V4I8_ZERO_I; + case NVPTX::SUST_B_2D_V4I16_ZERO_R: + return NVPTX::SUST_B_2D_V4I16_ZERO_I; + case NVPTX::SUST_B_2D_V4I32_ZERO_R: + return NVPTX::SUST_B_2D_V4I32_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_I8_ZERO_R: + return 
NVPTX::SUST_B_2D_ARRAY_I8_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_I16_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_I16_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_I32_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_I32_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_I64_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_I64_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V2I8_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V2I8_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V2I16_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V2I16_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V2I32_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V2I32_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V2I64_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V2I64_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V4I8_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V4I8_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V4I16_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V4I16_ZERO_I; + case NVPTX::SUST_B_2D_ARRAY_V4I32_ZERO_R: + return NVPTX::SUST_B_2D_ARRAY_V4I32_ZERO_I; + case NVPTX::SUST_B_3D_I8_ZERO_R: + return NVPTX::SUST_B_3D_I8_ZERO_I; + case NVPTX::SUST_B_3D_I16_ZERO_R: + return NVPTX::SUST_B_3D_I16_ZERO_I; + case NVPTX::SUST_B_3D_I32_ZERO_R: + return NVPTX::SUST_B_3D_I32_ZERO_I; + case NVPTX::SUST_B_3D_I64_ZERO_R: + return NVPTX::SUST_B_3D_I64_ZERO_I; + case NVPTX::SUST_B_3D_V2I8_ZERO_R: + return NVPTX::SUST_B_3D_V2I8_ZERO_I; + case NVPTX::SUST_B_3D_V2I16_ZERO_R: + return NVPTX::SUST_B_3D_V2I16_ZERO_I; + case NVPTX::SUST_B_3D_V2I32_ZERO_R: + return NVPTX::SUST_B_3D_V2I32_ZERO_I; + case NVPTX::SUST_B_3D_V2I64_ZERO_R: + return NVPTX::SUST_B_3D_V2I64_ZERO_I; + case NVPTX::SUST_B_3D_V4I8_ZERO_R: + return NVPTX::SUST_B_3D_V4I8_ZERO_I; + case NVPTX::SUST_B_3D_V4I16_ZERO_R: + return NVPTX::SUST_B_3D_V4I16_ZERO_I; + case NVPTX::SUST_B_3D_V4I32_ZERO_R: + return NVPTX::SUST_B_3D_V4I32_ZERO_I; + case NVPTX::SUST_P_1D_I8_TRAP_R: + return NVPTX::SUST_P_1D_I8_TRAP_I; + case NVPTX::SUST_P_1D_I16_TRAP_R: + return NVPTX::SUST_P_1D_I16_TRAP_I; + case NVPTX::SUST_P_1D_I32_TRAP_R: + return NVPTX::SUST_P_1D_I32_TRAP_I; + case 
NVPTX::SUST_P_1D_V2I8_TRAP_R: + return NVPTX::SUST_P_1D_V2I8_TRAP_I; + case NVPTX::SUST_P_1D_V2I16_TRAP_R: + return NVPTX::SUST_P_1D_V2I16_TRAP_I; + case NVPTX::SUST_P_1D_V2I32_TRAP_R: + return NVPTX::SUST_P_1D_V2I32_TRAP_I; + case NVPTX::SUST_P_1D_V4I8_TRAP_R: + return NVPTX::SUST_P_1D_V4I8_TRAP_I; + case NVPTX::SUST_P_1D_V4I16_TRAP_R: + return NVPTX::SUST_P_1D_V4I16_TRAP_I; + case NVPTX::SUST_P_1D_V4I32_TRAP_R: + return NVPTX::SUST_P_1D_V4I32_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_I8_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_I8_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_I16_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_I16_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_I32_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_I32_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V2I8_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V2I8_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V2I16_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V2I16_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V2I32_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V2I32_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V4I8_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V4I8_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V4I16_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V4I16_TRAP_I; + case NVPTX::SUST_P_1D_ARRAY_V4I32_TRAP_R: + return NVPTX::SUST_P_1D_ARRAY_V4I32_TRAP_I; + case NVPTX::SUST_P_2D_I8_TRAP_R: + return NVPTX::SUST_P_2D_I8_TRAP_I; + case NVPTX::SUST_P_2D_I16_TRAP_R: + return NVPTX::SUST_P_2D_I16_TRAP_I; + case NVPTX::SUST_P_2D_I32_TRAP_R: + return NVPTX::SUST_P_2D_I32_TRAP_I; + case NVPTX::SUST_P_2D_V2I8_TRAP_R: + return NVPTX::SUST_P_2D_V2I8_TRAP_I; + case NVPTX::SUST_P_2D_V2I16_TRAP_R: + return NVPTX::SUST_P_2D_V2I16_TRAP_I; + case NVPTX::SUST_P_2D_V2I32_TRAP_R: + return NVPTX::SUST_P_2D_V2I32_TRAP_I; + case NVPTX::SUST_P_2D_V4I8_TRAP_R: + return NVPTX::SUST_P_2D_V4I8_TRAP_I; + case NVPTX::SUST_P_2D_V4I16_TRAP_R: + return NVPTX::SUST_P_2D_V4I16_TRAP_I; + case NVPTX::SUST_P_2D_V4I32_TRAP_R: + return NVPTX::SUST_P_2D_V4I32_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_I8_TRAP_R: + return 
NVPTX::SUST_P_2D_ARRAY_I8_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_I16_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_I16_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_I32_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_I32_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V2I8_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V2I8_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V2I16_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V2I16_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V2I32_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V2I32_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V4I8_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V4I8_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V4I16_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V4I16_TRAP_I; + case NVPTX::SUST_P_2D_ARRAY_V4I32_TRAP_R: + return NVPTX::SUST_P_2D_ARRAY_V4I32_TRAP_I; + case NVPTX::SUST_P_3D_I8_TRAP_R: + return NVPTX::SUST_P_3D_I8_TRAP_I; + case NVPTX::SUST_P_3D_I16_TRAP_R: + return NVPTX::SUST_P_3D_I16_TRAP_I; + case NVPTX::SUST_P_3D_I32_TRAP_R: + return NVPTX::SUST_P_3D_I32_TRAP_I; + case NVPTX::SUST_P_3D_V2I8_TRAP_R: + return NVPTX::SUST_P_3D_V2I8_TRAP_I; + case NVPTX::SUST_P_3D_V2I16_TRAP_R: + return NVPTX::SUST_P_3D_V2I16_TRAP_I; + case NVPTX::SUST_P_3D_V2I32_TRAP_R: + return NVPTX::SUST_P_3D_V2I32_TRAP_I; + case NVPTX::SUST_P_3D_V4I8_TRAP_R: + return NVPTX::SUST_P_3D_V4I8_TRAP_I; + case NVPTX::SUST_P_3D_V4I16_TRAP_R: + return NVPTX::SUST_P_3D_V4I16_TRAP_I; + case NVPTX::SUST_P_3D_V4I32_TRAP_R: + return NVPTX::SUST_P_3D_V4I32_TRAP_I; default: llvm_unreachable("Unhandled SUST opcode"); } From ace356bc9777e6a5b5aa0ba2335d2546ac6f330e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 11 Jun 2025 20:45:32 +0100 Subject: [PATCH 133/851] [VPlan] Always verify VPCanonicalIVPHIRecipe placement (NFC). Loop regions are dissolved since dcef154b5caf6556e69bb1, remove the check for VerifyLate and corresponding TODO. 
--- llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 45010d0021581..fba4a68f4a27b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -429,8 +429,7 @@ bool VPlanVerifier::verify(const VPlan &Plan) { return false; } - // TODO: Remove once loop regions are dissolved before execution. - if (!VerifyLate && !isa(&*Entry->begin())) { + if (!isa(&*Entry->begin())) { errs() << "VPlan vector loop header does not start with a " "VPCanonicalIVPHIRecipe\n"; return false; From ebc90d50b88a7c46634ea21e40ddb25c679ac874 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:03 -0700 Subject: [PATCH 134/851] [SandboxVectorizer] Use llvm::find (NFC) (#143724) llvm::find allows us to pass a range. --- .../llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h index d4cb34647cf55..6d2144b14bb00 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h @@ -68,7 +68,7 @@ class SeedBundle { /// the seeds in a bundle. This allows constant time evaluation /// and "removal" from the list. 
void setUsed(Instruction *I) { - auto It = std::find(begin(), end(), I); + auto It = llvm::find(*this, I); assert(It != end() && "Instruction not in the bundle!"); auto Idx = It - begin(); setUsed(Idx, 1, /*VerifyUnused=*/false); From e266d6a5da6871c89747416c70a4a39181b594fb Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:11 -0700 Subject: [PATCH 135/851] [Format] Use llvm::min_element (NFC) (#143725) llvm::min_elements allows us to pass a range. --- clang/lib/Format/MacroCallReconstructor.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/lib/Format/MacroCallReconstructor.cpp b/clang/lib/Format/MacroCallReconstructor.cpp index 116bbad320e1f..895d9f93dfce3 100644 --- a/clang/lib/Format/MacroCallReconstructor.cpp +++ b/clang/lib/Format/MacroCallReconstructor.cpp @@ -528,10 +528,10 @@ MacroCallReconstructor::createUnwrappedLine(const ReconstructedLine &Line, // 1. One level below the current line's level. // 2. At the correct level relative to each other. unsigned MinChildLevel = - std::min_element(N->Children.begin(), N->Children.end(), - [](const auto &E1, const auto &E2) { - return E1->Level < E2->Level; - }) + llvm::min_element(N->Children, + [](const auto &E1, const auto &E2) { + return E1->Level < E2->Level; + }) ->get() ->Level; for (const auto &Child : N->Children) { From c1d21f44340901f6a23ae7eb7c5379f5ad197b27 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:19 -0700 Subject: [PATCH 136/851] [lld] Use std::tie to implement comparison operators (NFC) (#143726) std::tie facilitates lexicographical comparisons through std::tuple's built-in operator< and operator>. 
--- lld/ELF/SyntheticSections.cpp | 7 ++----- lld/MachO/UnwindInfoSection.cpp | 8 +++----- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 785a56cdb349e..0a9c7a081eb8b 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -1939,11 +1939,8 @@ bool AndroidPackedRelocationSection::updateAllocSize(Ctx &ctx) { // For Rela, we also want to sort by r_addend when r_info is the same. This // enables us to group by r_addend as well. llvm::sort(nonRelatives, [](const Elf_Rela &a, const Elf_Rela &b) { - if (a.r_info != b.r_info) - return a.r_info < b.r_info; - if (a.r_addend != b.r_addend) - return a.r_addend < b.r_addend; - return a.r_offset < b.r_offset; + return std::tie(a.r_info, a.r_addend, a.r_offset) < + std::tie(b.r_info, b.r_addend, b.r_offset); }); // Group relocations with the same r_info. Note that each group emits a group diff --git a/lld/MachO/UnwindInfoSection.cpp b/lld/MachO/UnwindInfoSection.cpp index 624464e41d77c..6e9f6c2aba749 100644 --- a/lld/MachO/UnwindInfoSection.cpp +++ b/lld/MachO/UnwindInfoSection.cpp @@ -535,11 +535,9 @@ void UnwindInfoSectionImpl::finalize() { llvm::sort(commonEncodings, [](const std::pair &a, const std::pair &b) { - if (a.second == b.second) - // When frequencies match, secondarily sort on encoding - // to maintain parity with validate-unwind-info.py - return a.first > b.first; - return a.second > b.second; + // When frequencies match, secondarily sort on encoding + // to maintain parity with validate-unwind-info.py + return std::tie(a.second, a.first) > std::tie(b.second, b.first); }); // Truncate the vector to 127 elements. From 8da1ac98efa0d315824a92d8b563299eccc3e0f1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:27 -0700 Subject: [PATCH 137/851] [llvm] Use std::tie to implement operator< (NFC) (#143728) std::tie facilitates lexicographical comparisons through std::tuple's built-in operator<. 
--- .../ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 12 +++--------- llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp | 8 ++------ 2 files changed, 5 insertions(+), 15 deletions(-) diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index 24b03a058981a..89b20978c40e6 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -202,15 +202,9 @@ class RelocationValueRef { IsStubThumb == Other.IsStubThumb; } inline bool operator<(const RelocationValueRef &Other) const { - if (SectionID != Other.SectionID) - return SectionID < Other.SectionID; - if (Offset != Other.Offset) - return Offset < Other.Offset; - if (Addend != Other.Addend) - return Addend < Other.Addend; - if (IsStubThumb != Other.IsStubThumb) - return IsStubThumb < Other.IsStubThumb; - return SymbolName < Other.SymbolName; + return std::tie(SectionID, Offset, Addend, IsStubThumb, SymbolName) < + std::tie(Other.SectionID, Other.Offset, Other.Addend, + Other.IsStubThumb, Other.SymbolName); } }; diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp index f38e7b879e5f0..5dde47ab3de57 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp @@ -253,7 +253,7 @@ namespace { bool operator!=(Register R) const { return !operator==(R); } bool operator<(Register R) const { // For std::map. 
- return Reg < R.Reg || (Reg == R.Reg && Sub < R.Sub); + return std::tie(Reg, Sub) < std::tie(R.Reg, R.Sub); } llvm::Register Reg; unsigned Sub = 0; @@ -298,11 +298,7 @@ namespace { return !operator==(Ex); } bool operator<(const ExtExpr &Ex) const { - if (Rs != Ex.Rs) - return Rs < Ex.Rs; - if (S != Ex.S) - return S < Ex.S; - return !Neg && Ex.Neg; + return std::tie(Rs, S, Neg) < std::tie(Ex.Rs, Ex.S, Ex.Neg); } }; From 43c35e858ccae05d69151ccf9712a725aae37b52 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 12:50:35 -0700 Subject: [PATCH 138/851] [mlir] Simplify calls to *Map::{insert,try_emplace} (NFC) (#143729) This patch simplifies code by removing the values from insert/try_emplace. Note that default values inserted by try_emplace are immediately overrideen in all these cases. --- mlir/lib/IR/AsmPrinter.cpp | 3 +-- mlir/lib/IR/SymbolTable.cpp | 2 +- mlir/lib/Transforms/Utils/CFGToSCF.cpp | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/IR/AsmPrinter.cpp b/mlir/lib/IR/AsmPrinter.cpp index fc1806900c0aa..c7cc6a02ad208 100644 --- a/mlir/lib/IR/AsmPrinter.cpp +++ b/mlir/lib/IR/AsmPrinter.cpp @@ -1146,8 +1146,7 @@ template std::pair AliasInitializer::visitImpl( T value, llvm::MapVector &aliases, bool canBeDeferred, PrintArgs &&...printArgs) { - auto [it, inserted] = - aliases.insert({value.getAsOpaquePointer(), InProgressAliasInfo()}); + auto [it, inserted] = aliases.try_emplace(value.getAsOpaquePointer()); size_t aliasIndex = std::distance(aliases.begin(), it); if (!inserted) { // Make sure that the alias isn't deferred if we don't permit it. diff --git a/mlir/lib/IR/SymbolTable.cpp b/mlir/lib/IR/SymbolTable.cpp index 075a0ba15d7cd..aaa4d5617eb4f 100644 --- a/mlir/lib/IR/SymbolTable.cpp +++ b/mlir/lib/IR/SymbolTable.cpp @@ -1100,7 +1100,7 @@ void SymbolUserMap::replaceAllUsesWith(Operation *symbol, if (newSymbol != symbol) { // Transfer over the users to the new symbol. 
The reference to the old one // is fetched again as the iterator is invalidated during the insertion. - auto newIt = symbolToUsers.try_emplace(newSymbol, SetVector{}); + auto newIt = symbolToUsers.try_emplace(newSymbol); auto oldIt = symbolToUsers.find(symbol); assert(oldIt != symbolToUsers.end() && "missing old users list"); if (newIt.second) diff --git a/mlir/lib/Transforms/Utils/CFGToSCF.cpp b/mlir/lib/Transforms/Utils/CFGToSCF.cpp index de380fc325f55..7c1781044d2a2 100644 --- a/mlir/lib/Transforms/Utils/CFGToSCF.cpp +++ b/mlir/lib/Transforms/Utils/CFGToSCF.cpp @@ -709,7 +709,7 @@ transformToReduceLoop(Block *loopHeader, Block *exitBlock, llvm::SmallDenseMap dominanceCache; // Returns true if `loopBlock` dominates `block`. auto loopBlockDominates = [&](Block *block) { - auto [iter, inserted] = dominanceCache.insert({block, false}); + auto [iter, inserted] = dominanceCache.try_emplace(block); if (!inserted) return iter->second; iter->second = dominanceInfo.dominates(loopBlock, block); From ad2a2b8eed2f3ed1e050833ea8a8d88b0878c6a7 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Wed, 11 Jun 2025 13:05:21 -0700 Subject: [PATCH 139/851] [llvm] Add a tool to check mustache compliance against the public spec (#142813) This is a cli tool to that tests the conformance of LLVM's mustache implementation against the public Mustache spec, hosted at https://github.com/mustache/spec. This is a revised version of the patches in #111487. 
Co-authored-by: Peter Chou --- llvm/CMakeLists.txt | 1 + llvm/docs/CommandGuide/index.rst | 1 + .../CommandGuide/llvm-test-mustache-spec.rst | 37 +++ .../llvm-test-mustache-spec/CMakeLists.txt | 5 + .../llvm-test-mustache-spec.cpp | 268 ++++++++++++++++++ 5 files changed, 312 insertions(+) create mode 100644 llvm/docs/CommandGuide/llvm-test-mustache-spec.rst create mode 100644 llvm/utils/llvm-test-mustache-spec/CMakeLists.txt create mode 100644 llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 206f009b45f59..cfb67472aa71e 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1313,6 +1313,7 @@ if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/yaml-bench) add_subdirectory(utils/split-file) add_subdirectory(utils/mlgo-utils) + add_subdirectory(utils/llvm-test-mustache-spec) if( LLVM_INCLUDE_TESTS ) set(LLVM_SUBPROJECT_TITLE "Third-Party/Google Test") add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest) diff --git a/llvm/docs/CommandGuide/index.rst b/llvm/docs/CommandGuide/index.rst index 643951eca2a26..88fc1fd326b76 100644 --- a/llvm/docs/CommandGuide/index.rst +++ b/llvm/docs/CommandGuide/index.rst @@ -87,6 +87,7 @@ Developer Tools llvm-exegesis llvm-ifs llvm-locstats + llvm-test-mustache-spec llvm-pdbutil llvm-profgen llvm-tli-checker diff --git a/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst b/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst new file mode 100644 index 0000000000000..8cd5a349e7e49 --- /dev/null +++ b/llvm/docs/CommandGuide/llvm-test-mustache-spec.rst @@ -0,0 +1,37 @@ +llvm-test-mustache-spec - LLVM tool to test Mustache library compliance +======================================================================= + +.. 
program:: llvm-test-mustache-spec + +SYNOPSIS +-------- + +:program:`llvm-test-mustache-spec` [*inputs...*] + +Description +----------- + +``llvm-test-mustache-spec`` tests the mustache spec conformance of the LLVM +mustache library. The spec can be found here: https://github.com/mustache/spec + +To test against the spec, simply download the spec and pass the test JSON files +to the driver. Each spec file should have a list of tests for compliance with +the spec. These are loaded as test cases, and rendered with our Mustache +implementation, which is then compared against the expected output from the +spec. + +The current implementation only supports non-optional parts of the spec, so +we do not expect any of the dynamic-names, inheritance, or lambda tests to +pass. Additionally, Triple Mustache is not supported. Unsupported tests are +marked as XFail and are removed from the XFail list as they are fixed. + +The tool prints the number of test failures and successes in each of the test +files to standard output. + +EXAMPLE +------- + +.. code-block:: console + + $ llvm-test-mustache-spec path/to/specs/\*.json + diff --git a/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt b/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt new file mode 100644 index 0000000000000..dc1aa73371ffc --- /dev/null +++ b/llvm/utils/llvm-test-mustache-spec/CMakeLists.txt @@ -0,0 +1,5 @@ +add_llvm_utility(llvm-test-mustache-spec + llvm-test-mustache-spec.cpp +) + +target_link_libraries(llvm-test-mustache-spec PRIVATE LLVMSupport) diff --git a/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp new file mode 100644 index 0000000000000..28ed1b876672d --- /dev/null +++ b/llvm/utils/llvm-test-mustache-spec/llvm-test-mustache-spec.cpp @@ -0,0 +1,268 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Simple drivers to test the mustache spec found at: +// https://github.com/mustache/spec +// +// It is used to verify that the current implementation conforms to the spec. +// Simply download the spec and pass the test JSON files to the driver. Each +// spec file should have a list of tests for compliance with the spec. These +// are loaded as test cases, and rendered with our Mustache implementation, +// which is then compared against the expected output from the spec. +// +// The current implementation only supports non-optional parts of the spec, so +// we do not expect any of the dynamic-names, inheritance, or lambda tests to +// pass. Additionally, Triple Mustache is not supported. Unsupported tests are +// marked as XFail and are removed from the XFail list as they are fixed. +// +// Usage: +// llvm-test-mustache-spec path/to/test/file.json path/to/test/file2.json ... 
+//===----------------------------------------------------------------------===// + +#include "llvm/ADT/StringSet.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Mustache.h" +#include "llvm/Support/Path.h" +#include + +using namespace llvm; +using namespace llvm::json; +using namespace llvm::mustache; + +#define DEBUG_TYPE "llvm-test-mustache-spec" + +static cl::OptionCategory Cat("llvm-test-mustache-spec Options"); + +static cl::list + InputFiles(cl::Positional, cl::desc(""), cl::OneOrMore); + +static cl::opt ReportErrors("report-errors", + cl::desc("Report errors in spec tests"), + cl::cat(Cat)); + +static ExitOnError ExitOnErr; + +static int NumXFail = 0; +static int NumSuccess = 0; + +static const StringMap> XFailTestNames = {{ + {"delimiters.json", + { + "Pair Behavior", + "Special Characters", + "Sections", + "Inverted Sections", + "Partial Inheritence", + "Post-Partial Behavior", + "Standalone Tag", + "Indented Standalone Tag", + "Standalone Line Endings", + "Standalone Without Previous Line", + "Standalone Without Newline", + }}, + {"~dynamic-names.json", + { + "Basic Behavior - Partial", + "Basic Behavior - Name Resolution", + "Context", + "Dotted Names", + "Dotted Names - Failed Lookup", + "Dotted names - Context Stacking", + "Dotted names - Context Stacking Under Repetition", + "Dotted names - Context Stacking Failed Lookup", + "Recursion", + "Surrounding Whitespace", + "Inline Indentation", + "Standalone Line Endings", + "Standalone Without Previous Line", + "Standalone Without Newline", + "Standalone Indentation", + "Padding Whitespace", + }}, + {"~inheritance.json", + { + "Default", + "Variable", + "Triple Mustache", + "Sections", + "Negative Sections", + "Mustache Injection", + "Inherit", + "Overridden content", + "Data does not override block default", + "Two overridden parents", + "Override parent with newlines", + "Inherit 
indentation", + "Only one override", + "Parent template", + "Recursion", + "Multi-level inheritance, no sub child", + "Text inside parent", + "Text inside parent", + "Block scope", + "Standalone parent", + "Standalone block", + "Block reindentation", + "Intrinsic indentation", + "Nested block reindentation", + + }}, + {"~lambdas.json", + { + "Interpolation", + "Interpolation - Expansion", + "Interpolation - Alternate Delimiters", + "Interpolation - Multiple Calls", + "Escaping", + "Section", + "Section - Expansion", + "Section - Alternate Delimiters", + "Section - Multiple Calls", + + }}, + {"interpolation.json", + { + "Triple Mustache", + "Triple Mustache Integer Interpolation", + "Triple Mustache Decimal Interpolation", + "Triple Mustache Null Interpolation", + "Triple Mustache Context Miss Interpolation", + "Dotted Names - Triple Mustache Interpolation", + "Implicit Iterators - Triple Mustache", + "Triple Mustache - Surrounding Whitespace", + "Triple Mustache - Standalone", + "Triple Mustache With Padding", + }}, + {"partials.json", {"Standalone Indentation"}}, + {"sections.json", {"Implicit Iterator - Triple mustache"}}, +}}; + +struct TestData { + static Expected createTestData(json::Object *TestCase, + StringRef InputFile) { + // If any of the needed elements are missing, we cannot continue. + // NOTE: partials are optional in the test schema. 
+ if (!TestCase || !TestCase->getString("template") || + !TestCase->getString("expected") || !TestCase->getString("name") || + !TestCase->get("data")) + return createStringError( + llvm::inconvertibleErrorCode(), + "invalid JSON schema in test file: " + InputFile + "\n"); + + return TestData{TestCase->getString("template").value(), + TestCase->getString("expected").value(), + TestCase->getString("name").value(), TestCase->get("data"), + TestCase->get("partials")}; + } + + TestData() = default; + + StringRef TemplateStr; + StringRef ExpectedStr; + StringRef Name; + Value *Data; + Value *Partials; +}; + +static void reportTestFailure(const TestData &TD, StringRef ActualStr, + bool IsXFail) { + LLVM_DEBUG(dbgs() << "Template: " << TD.TemplateStr << "\n"); + if (TD.Partials) { + LLVM_DEBUG(dbgs() << "Partial: "); + LLVM_DEBUG(TD.Partials->print(dbgs())); + LLVM_DEBUG(dbgs() << "\n"); + } + LLVM_DEBUG(dbgs() << "JSON Data: "); + LLVM_DEBUG(TD.Data->print(dbgs())); + LLVM_DEBUG(dbgs() << "\n"); + outs() << formatv("Test {}: {}\n", (IsXFail ? 
"XFailed" : "Failed"), TD.Name); + if (ReportErrors) { + outs() << " Expected: \'" << TD.ExpectedStr << "\'\n" + << " Actual: \'" << ActualStr << "\'\n" + << " ====================\n"; + } +} + +static void registerPartials(Value *Partials, Template &T) { + if (!Partials) + return; + for (const auto &[Partial, Str] : *Partials->getAsObject()) + T.registerPartial(Partial.str(), Str.getAsString()->str()); +} + +static json::Value readJsonFromFile(StringRef &InputFile) { + std::unique_ptr Buffer = + ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(InputFile))); + return ExitOnErr(parse(Buffer->getBuffer())); +} + +static bool isTestXFail(StringRef FileName, StringRef TestName) { + auto P = llvm::sys::path::filename(FileName); + auto It = XFailTestNames.find(P); + return It != XFailTestNames.end() && It->second.contains(TestName); +} + +static bool evaluateTest(StringRef &InputFile, TestData &TestData, + std::string &ActualStr) { + bool IsXFail = isTestXFail(InputFile, TestData.Name); + bool Matches = TestData.ExpectedStr == ActualStr; + if ((Matches && IsXFail) || (!Matches && !IsXFail)) { + reportTestFailure(TestData, ActualStr, IsXFail); + return false; + } + IsXFail ? NumXFail++ : NumSuccess++; + return true; +} + +static void runTest(StringRef InputFile) { + NumXFail = 0; + NumSuccess = 0; + outs() << "Running Tests: " << InputFile << "\n"; + json::Value Json = readJsonFromFile(InputFile); + + json::Object *Obj = Json.getAsObject(); + Array *TestArray = Obj->getArray("tests"); + // Even though we parsed the JSON, it can have a bad format, so check it. 
+ if (!TestArray) + ExitOnErr(createStringError( + llvm::inconvertibleErrorCode(), + "invalid JSON schema in test file: " + InputFile + "\n")); + + const size_t Total = TestArray->size(); + + for (Value V : *TestArray) { + auto TestData = + ExitOnErr(TestData::createTestData(V.getAsObject(), InputFile)); + Template T(TestData.TemplateStr); + registerPartials(TestData.Partials, T); + + std::string ActualStr; + raw_string_ostream OS(ActualStr); + T.render(*TestData.Data, OS); + evaluateTest(InputFile, TestData, ActualStr); + } + + const int NumFailed = Total - NumSuccess - NumXFail; + outs() << formatv("===Results===\n" + " Suceeded: {}\n" + " Expectedly Failed: {}\n" + " Failed: {}\n" + " Total: {}\n", + NumSuccess, NumXFail, NumFailed, Total); +} + +int main(int argc, char **argv) { + ExitOnErr.setBanner(std::string(argv[0]) + " error: "); + cl::ParseCommandLineOptions(argc, argv); + for (const auto &FileName : InputFiles) + runTest(FileName); + return 0; +} From e7e491f6ee2baee4e2ab2947e1c64bc54e3ebbec Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 11 Jun 2025 13:06:22 -0700 Subject: [PATCH 140/851] [SelectionDAG] Add ISD::VSELECT to SelectionDAG::canCreateUndefOrPoison. 
(#143760) --- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 1 + .../RISCV/rvv/combine-reduce-add-to-vcpop.ll | 69 +++++++++---------- .../CodeGen/RISCV/rvv/vector-interleave.ll | 16 ++--- .../test/CodeGen/X86/avx10_2_512bf16-arith.ll | 2 +- llvm/test/CodeGen/X86/avx10_2bf16-arith.ll | 4 +- 5 files changed, 46 insertions(+), 46 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 4fc026ca562ba..45a37622a531b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5553,6 +5553,7 @@ bool SelectionDAG::canCreateUndefOrPoison(SDValue Op, const APInt &DemandedElts, case ISD::BUILD_VECTOR: case ISD::BUILD_PAIR: case ISD::SPLAT_VECTOR: + case ISD::VSELECT: return false; case ISD::SELECT_CC: diff --git a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll index 88894f887cc20..5dc532273b770 100644 --- a/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll +++ b/llvm/test/CodeGen/RISCV/rvv/combine-reduce-add-to-vcpop.ll @@ -313,12 +313,12 @@ define i32 @test_nxv128i1( %x) { ; CHECK-NEXT: vslidedown.vx v0, v6, a0 ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma ; CHECK-NEXT: vslidedown.vx v6, v7, a1 +; CHECK-NEXT: vsetvli a1, zero, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: vsetvli a1, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v7, a0 ; CHECK-NEXT: vslidedown.vx v5, v6, a0 -; CHECK-NEXT: vslidedown.vx v4, v7, a0 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v4 ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v5 ; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t @@ -364,9 +364,9 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vmv1r.v v7, v9 ; CHECK-NEXT: vmv1r.v v5, v8 ; CHECK-NEXT: vmv1r.v v4, v0 -; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmv.v.i v24, 0 ; 
CHECK-NEXT: csrr a1, vlenb -; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: mv a2, a0 @@ -376,7 +376,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vs8r.v v8, (a0) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v5 -; CHECK-NEXT: vmerge.vim v8, v16, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 5 ; CHECK-NEXT: add a0, sp, a0 @@ -388,9 +388,8 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vslidedown.vx v3, v4, a0 ; CHECK-NEXT: vslidedown.vx v2, v5, a0 ; CHECK-NEXT: vmv.v.v v0, v3 -; CHECK-NEXT: vmv8r.v v8, v16 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: mv a3, a2 @@ -398,42 +397,43 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: add a2, a2, a3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vmv1r.v v0, v2 -; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 4 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v3, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: csrr a2, vlenb ; CHECK-NEXT: slli a2, a2, 3 ; CHECK-NEXT: add a2, sp, a2 ; CHECK-NEXT: addi a2, a2, 16 -; CHECK-NEXT: vs8r.v v16, (a2) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vs8r.v v8, (a2) # 
vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v2, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v24, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v16, v24, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v4, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma -; CHECK-NEXT: vmerge.vim v16, v8, 1, v0 +; CHECK-NEXT: vmerge.vim v8, v24, 1, v0 ; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma ; CHECK-NEXT: vslidedown.vx v0, v5, a1 +; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, ma +; CHECK-NEXT: vmerge.vim v24, v24, 1, v0 +; CHECK-NEXT: vsetvli a2, zero, e8, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v0, v6, a1 ; CHECK-NEXT: vslidedown.vx v5, v7, a1 -; CHECK-NEXT: vslidedown.vx v4, v6, a1 ; CHECK-NEXT: vsetvli a2, zero, e32, m8, ta, mu -; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v4 -; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t +; CHECK-NEXT: vadd.vi v24, v24, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v5 -; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t -; CHECK-NEXT: vadd.vv v8, v16, v8 +; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t +; CHECK-NEXT: vadd.vv v8, v8, v24 ; CHECK-NEXT: addi a2, sp, 16 ; CHECK-NEXT: vs8r.v v8, (a2) # vscale x 64-byte Folded Spill ; CHECK-NEXT: vsetvli a2, zero, e8, m1, ta, ma @@ -443,7 +443,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: vslidedown.vx v0, v4, a1 ; CHECK-NEXT: vslidedown.vx v3, v5, a1 ; CHECK-NEXT: vsetvli a0, zero, e32, m8, ta, mu -; CHECK-NEXT: vadd.vi v24, v24, 1, v0.t +; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t ; CHECK-NEXT: vmv1r.v v0, v3 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 @@ -451,7 +451,7 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t -; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vadd.vv v8, v8, v16 ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; 
CHECK-NEXT: add a0, sp, a0 @@ -492,16 +492,16 @@ define i32 @test_nxv256i1( %x) { ; CHECK-NEXT: addi a0, a0, 16 ; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: vadd.vi v24, v24, 1, v0.t -; CHECK-NEXT: vadd.vv v24, v24, v8 +; CHECK-NEXT: vadd.vv v0, v24, v8 ; CHECK-NEXT: addi a0, sp, 16 ; CHECK-NEXT: vl8r.v v8, (a0) # vscale x 64-byte Folded Reload ; CHECK-NEXT: csrr a0, vlenb ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: add a0, sp, a0 ; CHECK-NEXT: addi a0, a0, 16 -; CHECK-NEXT: vl8r.v v0, (a0) # vscale x 64-byte Folded Reload -; CHECK-NEXT: vadd.vv v8, v8, v0 -; CHECK-NEXT: vadd.vv v16, v24, v16 +; CHECK-NEXT: vl8r.v v24, (a0) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vadd.vv v8, v8, v24 +; CHECK-NEXT: vadd.vv v16, v0, v16 ; CHECK-NEXT: vadd.vv v8, v16, v8 ; CHECK-NEXT: vmv.s.x v16, zero ; CHECK-NEXT: vredsum.vs v8, v8, v16 @@ -537,18 +537,17 @@ entry: define i16 @test_narrow_nxv64i1( %x) { ; CHECK-LABEL: test_narrow_nxv64i1: ; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 ; CHECK-NEXT: csrr a0, vlenb -; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma -; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: vmerge.vim v8, v8, 1, v0 ; CHECK-NEXT: srli a0, a0, 1 ; CHECK-NEXT: vsetvli a1, zero, e8, m1, ta, ma -; CHECK-NEXT: vslidedown.vx v8, v0, a0 +; CHECK-NEXT: vslidedown.vx v0, v0, a0 ; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, mu -; CHECK-NEXT: vmerge.vim v16, v16, 1, v0 -; CHECK-NEXT: vmv1r.v v0, v8 -; CHECK-NEXT: vadd.vi v16, v16, 1, v0.t -; CHECK-NEXT: vmv.s.x v8, zero -; CHECK-NEXT: vredsum.vs v8, v16, v8 +; CHECK-NEXT: vadd.vi v8, v8, 1, v0.t +; CHECK-NEXT: vmv.s.x v16, zero +; CHECK-NEXT: vredsum.vs v8, v8, v16 ; CHECK-NEXT: vmv.x.s a0, v8 ; CHECK-NEXT: ret entry: diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll index 77723609a60c7..e297e88c71f1b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll +++ 
b/llvm/test/CodeGen/RISCV/rvv/vector-interleave.ll @@ -260,18 +260,18 @@ define @vector_interleave_nxv128i1_nxv64i1( @llvm.vector.interleave2.nxv128i1( %a, %b) diff --git a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll index 1e2cf4956bd08..c22a394e6c4e0 100644 --- a/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2_512bf16-arith.ll @@ -94,8 +94,8 @@ define <32 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_512(<32 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_512: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: vsubbf16 %zmm2, %zmm1, %zmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xc9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %zmm1, %zmm1 # encoding: [0x62,0xf5,0x75,0x48,0x5c,0x08] ; X86-NEXT: vsubbf16 %zmm1, %zmm0, %zmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x49,0x5c,0xc1] diff --git a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll index 42831a453cb1d..435f67a0f1e4b 100644 --- a/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll +++ b/llvm/test/CodeGen/X86/avx10_2bf16-arith.ll @@ -147,8 +147,8 @@ define <16 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_256(<16 x bfloat> %src, ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_256: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: vsubbf16 %ymm2, %ymm1, %ymm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0xa9,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %ymm1, %ymm1 # encoding: [0x62,0xf5,0x75,0x28,0x5c,0x08] ; X86-NEXT: vsubbf16 %ymm1, %ymm0, %ymm0 {%k1} # encoding: 
[0x62,0xf5,0x7d,0x29,0x5c,0xc1] @@ -201,8 +201,8 @@ define <8 x bfloat> @test_int_x86_avx10_maskz_sub_bf16_128(<8 x bfloat> %src, <8 ; ; X86-LABEL: test_int_x86_avx10_maskz_sub_bf16_128: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: kmovb {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x08] ; X86-NEXT: vsubbf16 %xmm2, %xmm1, %xmm0 {%k1} {z} # encoding: [0x62,0xf5,0x75,0x89,0x5c,0xc2] ; X86-NEXT: vsubbf16 (%eax), %xmm1, %xmm1 # encoding: [0x62,0xf5,0x75,0x08,0x5c,0x08] ; X86-NEXT: vsubbf16 %xmm1, %xmm0, %xmm0 {%k1} # encoding: [0x62,0xf5,0x7d,0x09,0x5c,0xc1] From 5623b7f2d56ecba84de5d62444feed2dea2b7e25 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 11 Jun 2025 21:08:35 +0100 Subject: [PATCH 141/851] [LV] Use GeneratedRTChecks to check if safety checks were added (NFC). Directly check via GeneratedRTChecks if any checks have been added, instead of needing to go through ILV. This simplifies the code and enables further refactoring in follow-up patches. --- .../Transforms/Vectorize/LoopVectorize.cpp | 23 ++++++++----------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 2a237f42e4042..d236111836391 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -505,9 +505,6 @@ class InnerLoopVectorizer { /// Fix the vectorized code, taking care of header phi's, and more. void fixVectorizedLoop(VPTransformState &State); - // Return true if any runtime check is added. - bool areSafetyChecksAdded() { return AddedSafetyChecks; } - /// Fix the non-induction PHIs in \p Plan. void fixNonInductionPHIs(VPTransformState &State); @@ -620,9 +617,6 @@ class InnerLoopVectorizer { /// The profitablity analysis. 
LoopVectorizationCostModel *Cost; - // Record whether runtime checks are added. - bool AddedSafetyChecks = false; - /// BFI and PSI are used to check for profile guided size optimizations. BlockFrequencyInfo *BFI; ProfileSummaryInfo *PSI; @@ -1777,6 +1771,9 @@ class GeneratedRTChecks { /// they have been used. Value *MemRuntimeCheckCond = nullptr; + /// True if any checks have been added. + bool AddedAnyChecks = false; + DominatorTree *DT; LoopInfo *LI; TargetTransformInfo *TTI; @@ -2038,9 +2035,9 @@ class GeneratedRTChecks { if (AddBranchWeights) setBranchWeights(BI, SCEVCheckBypassWeights, /*IsExpected=*/false); ReplaceInstWithInst(SCEVCheckBlock->getTerminator(), &BI); - // Mark the check as used, to prevent it from being removed during cleanup. SCEVCheckCond = nullptr; + AddedAnyChecks = true; return SCEVCheckBlock; } @@ -2070,8 +2067,12 @@ class GeneratedRTChecks { // Mark the check as used, to prevent it from being removed during cleanup. MemRuntimeCheckCond = nullptr; + AddedAnyChecks = true; return MemCheckBlock; } + + /// Return true if any runtime checks have been added + bool hasChecks() const { return AddedAnyChecks; } }; } // namespace @@ -2459,7 +2460,6 @@ BasicBlock *InnerLoopVectorizer::emitSCEVChecks(BasicBlock *Bypass) { assert((!Cost->OptForSize || Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled) && "Cannot SCEV check stride or overflow when optimizing for size"); - AddedSafetyChecks = true; introduceCheckBlockInVPlan(SCEVCheckBlock); return SCEVCheckBlock; @@ -2494,9 +2494,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) { }); } - - AddedSafetyChecks = true; - introduceCheckBlockInVPlan(MemCheckBlock); return MemCheckBlock; } @@ -10287,7 +10284,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { } ++LoopsEpilogueVectorized; - if (!MainILV.areSafetyChecksAdded()) + if (!Checks.hasChecks()) DisableRuntimeUnroll = true; } else { InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, @@ -10299,7 
+10296,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // Add metadata to disable runtime unrolling a scalar loop when there // are no runtime checks about strides and memory. A scalar loop that is // rarely used is not worth unrolling. - if (!LB.areSafetyChecksAdded()) + if (!Checks.hasChecks()) DisableRuntimeUnroll = true; } // Report the vectorization decision. From c70658e32debfc3b2c0f6c2b2228ac48e976fd51 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 11 Jun 2025 13:09:05 -0700 Subject: [PATCH 142/851] [bazel] port 5dafe9dca867b90f20dcd71c620ad823aee4262b --- .../llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel index 40f672d8099f1..610978059d7e6 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdlib/BUILD.bazel @@ -107,6 +107,7 @@ libc_test( deps = [ "//libc:__support_fputil_fp_bits", "//libc:atof", + "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -206,6 +207,7 @@ libc_test_library( "//libc:__support_macros_properties_architectures", "//libc:errno", "//libc/test/UnitTest:LibcUnitTest", + "//libc/test/UnitTest:errno_test_helpers", ], ) @@ -251,6 +253,7 @@ libc_test( deps = [ "//libc:__support_fputil_fp_bits", "//libc:strtof", + "//libc/test/UnitTest:errno_test_helpers", "//libc/test/UnitTest:fp_test_helpers", ], ) @@ -261,6 +264,7 @@ libc_test( deps = [ "//libc:__support_fputil_fp_bits", "//libc:strtod", + "//libc/test/UnitTest:errno_test_helpers", "//libc/test/UnitTest:fp_test_helpers", ], ) @@ -272,5 +276,6 @@ libc_test( "//libc:__support_fputil_fp_bits", "//libc:__support_uint128", "//libc:strtold", + "//libc/test/UnitTest:errno_test_helpers", ], ) From 52583b3ed7dd39788360361fc1e21039c8eb5479 Mon Sep 17 00:00:00 2001 From: Uzair Nawaz Date: Wed, 11 Jun 2025 20:11:31 
+0000 Subject: [PATCH 143/851] [libc] Character converter skeleton class (#143619) Made CharacterConverter class skeleton --- libc/hdr/types/char32_t.h | 22 ++++++ libc/hdr/types/char8_t.h | 22 ++++++ libc/hdr/uchar_overlay.h | 69 +++++++++++++++++++ libc/src/__support/wchar/CMakeLists.txt | 26 +++++++ .../__support/wchar/character_converter.cpp | 32 +++++++++ .../src/__support/wchar/character_converter.h | 39 +++++++++++ libc/src/__support/wchar/mbstate.h | 27 ++++++++ libc/src/__support/wchar/utf_ret.h | 21 ++++++ 8 files changed, 258 insertions(+) create mode 100644 libc/hdr/types/char32_t.h create mode 100644 libc/hdr/types/char8_t.h create mode 100644 libc/hdr/uchar_overlay.h create mode 100644 libc/src/__support/wchar/CMakeLists.txt create mode 100644 libc/src/__support/wchar/character_converter.cpp create mode 100644 libc/src/__support/wchar/character_converter.h create mode 100644 libc/src/__support/wchar/mbstate.h create mode 100644 libc/src/__support/wchar/utf_ret.h diff --git a/libc/hdr/types/char32_t.h b/libc/hdr/types/char32_t.h new file mode 100644 index 0000000000000..94fe5747d3415 --- /dev/null +++ b/libc/hdr/types/char32_t.h @@ -0,0 +1,22 @@ +//===-- Definition of char32_t.h ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_CHAR32_T_H +#define LLVM_LIBC_HDR_TYPES_CHAR32_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/char32_t.h" + +#else // overlay mode + +#include "hdr/uchar_overlay.h" + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_CHAR32_T_H diff --git a/libc/hdr/types/char8_t.h b/libc/hdr/types/char8_t.h new file mode 100644 index 0000000000000..31de764658f9e --- /dev/null +++ b/libc/hdr/types/char8_t.h @@ -0,0 +1,22 @@ +//===-- Definition of char8_t.h -------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_TYPES_CHAR8_T_H +#define LLVM_LIBC_HDR_TYPES_CHAR8_T_H + +#ifdef LIBC_FULL_BUILD + +#include "include/llvm-libc-types/char8_t.h" + +#else // overlay mode + +#include "hdr/uchar_overlay.h" + +#endif // LLVM_LIBC_FULL_BUILD + +#endif // LLVM_LIBC_HDR_TYPES_CHAR8_T_H diff --git a/libc/hdr/uchar_overlay.h b/libc/hdr/uchar_overlay.h new file mode 100644 index 0000000000000..44ed3d48c6c1d --- /dev/null +++ b/libc/hdr/uchar_overlay.h @@ -0,0 +1,69 @@ +//===-- Including uchar.h in overlay mode ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_HDR_UCHAR_OVERLAY_H +#define LLVM_LIBC_HDR_UCHAR_OVERLAY_H + +#ifdef LIBC_FULL_BUILD +#error "This header should only be included in overlay mode" +#endif + +// Overlay mode + +// glibc header might provide extern inline definitions for few +// functions, causing external alias errors. They are guarded by +// `__USE_EXTERN_INLINES` macro. We temporarily disable `__USE_EXTERN_INLINES` +// macro by defining `__NO_INLINE__` before including . +// And the same with `__USE_FORTIFY_LEVEL`, which will be temporarily disabled +// with `_FORTIFY_SOURCE`. + +#ifdef _FORTIFY_SOURCE +#define LIBC_OLD_FORTIFY_SOURCE _FORTIFY_SOURCE +#undef _FORTIFY_SOURCE +#endif + +#ifndef __NO_INLINE__ +#define __NO_INLINE__ 1 +#define LIBC_SET_NO_INLINE +#endif + +#ifdef __USE_EXTERN_INLINES +#define LIBC_OLD_USE_EXTERN_INLINES +#undef __USE_EXTERN_INLINES +#endif + +#ifdef __USE_FORTIFY_LEVEL +#define LIBC_OLD_USE_FORTIFY_LEVEL __USE_FORTIFY_LEVEL +#undef __USE_FORTIFY_LEVEL +#define __USE_FORTIFY_LEVEL 0 +#endif + +#include + +#ifdef LIBC_OLD_FORTIFY_SOURCE +#define _FORTIFY_SOURCE LIBC_OLD_FORTIFY_SOURCE +#undef LIBC_OLD_FORTIFY_SOURCE +#endif + +#ifdef LIBC_SET_NO_INLINE +#undef __NO_INLINE__ +#undef LIBC_SET_NO_INLINE +#endif + +#ifdef LIBC_OLD_USE_FORTIFY_LEVEL +#undef __USE_FORTIFY_LEVEL +#define __USE_FORTIFY_LEVEL LIBC_OLD_USE_FORTIFY_LEVEL +#undef LIBC_OLD_USE_FORTIFY_LEVEL +#endif + +#ifdef LIBC_OLD_USE_EXTERN_INLINES +#define __USE_EXTERN_INLINES +#undef LIBC_OLD_USE_EXTERN_INLINES +#endif + +#endif // LLVM_LIBC_HDR_UCHAR_OVERLAY_H diff --git a/libc/src/__support/wchar/CMakeLists.txt b/libc/src/__support/wchar/CMakeLists.txt new file mode 100644 index 0000000000000..5cca58400ff45 --- /dev/null +++ b/libc/src/__support/wchar/CMakeLists.txt @@ -0,0 +1,26 @@ +add_header_library( + mbstate + HDRS + mbstate.h + DEPENDS + 
libc.hdr.types.char32_t +) + +add_object_library( + character_converter + HDRS + character_converter.h + SRCS + character_converter.cpp + DEPENDS + libc.hdr.types.char8_t + libc.hdr.types.char32_t + .mbstate + .utf_ret +) + +add_header_library( + utf_ret + HDRS + utf_ret.h +) diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp new file mode 100644 index 0000000000000..0afc2a6f59e64 --- /dev/null +++ b/libc/src/__support/wchar/character_converter.cpp @@ -0,0 +1,32 @@ +//===-- Implementation of a class for conversion --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/types/char32_t.h" +#include "hdr/types/char8_t.h" +#include "src/__support/wchar/mbstate.h" +#include "src/__support/wchar/utf_ret.h" + +#include "character_converter.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; } + +bool CharacterConverter::isComplete() {} + +int CharacterConverter::push(char8_t utf8_byte) {} + +int CharacterConverter::push(char32_t utf32) {} + +utf_ret CharacterConverter::pop_utf8() {} + +utf_ret CharacterConverter::pop_utf32() {} + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h new file mode 100644 index 0000000000000..a6bac43805376 --- /dev/null +++ b/libc/src/__support/wchar/character_converter.h @@ -0,0 +1,39 @@ +//===-- Definition of a class for mbstate_t and conversion -----*-- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H +#define LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H + +#include "hdr/types/char32_t.h" +#include "hdr/types/char8_t.h" +#include "src/__support/wchar/mbstate.h" +#include "src/__support/wchar/utf_ret.h" + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +class CharacterConverter { +private: + mbstate_t *state; + +public: + CharacterConverter(mbstate_t *mbstate); + + bool isComplete(); + + int push(char8_t utf8_byte); + int push(char32_t utf32); + + utf_ret pop_utf8(); + utf_ret pop_utf32(); +}; + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_CHARACTER_CONVERTER_H diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h new file mode 100644 index 0000000000000..72ec727560003 --- /dev/null +++ b/libc/src/__support/wchar/mbstate.h @@ -0,0 +1,27 @@ +//===-- Definition of mbstate-----------------------------------*-- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_MBSTATE_H +#define LLVM_LIBC_SRC___SUPPORT_MBSTATE_H + +#include "hdr/types/char32_t.h" +#include + +namespace LIBC_NAMESPACE_DECL { +namespace internal { + +struct mbstate { + char32_t partial; + uint8_t bits_processed; + uint8_t total_bytes; +}; + +} // namespace internal +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_MBSTATE_H diff --git a/libc/src/__support/wchar/utf_ret.h b/libc/src/__support/wchar/utf_ret.h new file mode 100644 index 0000000000000..b8a8f6f094143 --- /dev/null +++ b/libc/src/__support/wchar/utf_ret.h @@ -0,0 +1,21 @@ +//===-- Definition of utf_ret ----------------------------------*-- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_UTF_RET_H +#define LLVM_LIBC_SRC___SUPPORT_UTF_RET_H + +namespace LIBC_NAMESPACE_DECL { + +template struct utf_ret { + T out; + int error; +}; + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC___SUPPORT_UTF_RET_H From a2d2941830d9c141d7f43da1ff58e7b7235a9f7d Mon Sep 17 00:00:00 2001 From: Chelsea Cassanova Date: Wed, 11 Jun 2025 13:12:37 -0700 Subject: [PATCH 144/851] [lldb][RPC] Upstream LLDB to RPC converstion Python script (#138028) As part of upstreaming LLDB RPC, this commit adds a python script that is used by LLDB RPC to modify the public lldb header files for use with RPC. 
https://discourse.llvm.org/t/rfc-upstreaming-lldb-rpc/85804 --- .../convert-lldb-header-to-rpc-header.py | 108 ++++++++++++++++++ .../TestConvertScript/CheckLLDBDefines.test | 22 ++++ .../CheckLLDBEnumerations.test | 17 +++ .../TestConvertScript/CheckLLDBTypes.test | 24 ++++ .../TestConvertScript/CheckSBDefines.test | 22 ++++ .../TestConvertScript/Inputs/SBDefines.h | 22 ++++ .../TestConvertScript/Inputs/lldb-defines.h | 23 ++++ .../Inputs/lldb-enumerations.h | 17 +++ .../TestConvertScript/Inputs/lldb-types.h | 23 ++++ 9 files changed, 278 insertions(+) create mode 100755 lldb/scripts/convert-lldb-header-to-rpc-header.py create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h create mode 100644 lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h diff --git a/lldb/scripts/convert-lldb-header-to-rpc-header.py b/lldb/scripts/convert-lldb-header-to-rpc-header.py new file mode 100755 index 0000000000000..d7734280076ff --- /dev/null +++ b/lldb/scripts/convert-lldb-header-to-rpc-header.py @@ -0,0 +1,108 @@ +#!/usr/bin/env python3 +""" +Usage: convert-lldb-header-to-rpc-header.py + +This scripts takes common LLDB headers (such as lldb-defines.h) and replaces references to LLDB +with those for RPC. 
This happens for: +- namespace definitions +- namespace usage +- version string macros +- ifdef/ifndef lines +""" + +import argparse +import os +import re + + +INCLUDES_TO_REMOVE_REGEX = re.compile( + r'#include "lldb/lldb-forward.h"|#include "lldb/lldb-versioning.h"' +) +LLDB_GUARD_REGEX = re.compile(r"(?P#.+)LLDB_LLDB_\s*", re.M) +LLDB_API_GUARD_REGEX = re.compile(r"(?P#.+)LLDB_API_\s*", re.M) +LLDB_VERSION_REGEX = re.compile(r"#define LLDB_VERSION", re.M) +LLDB_REVISION_REGEX = re.compile(r"#define LLDB_REVISION", re.M) +LLDB_VERSION_STRING_REGEX = re.compile(r"#define LLDB_VERSION_STRING", re.M) +LLDB_LOCAL_INCLUDE_REGEX = re.compile(r'#include "lldb/lldb-\s*', re.M) +LLDB_NAMESPACE_DEFINITION_REGEX = re.compile( + r"(?P//\s*){,1}namespace lldb\s{1}", re.M +) +LLDB_NAMESPACE_REGEX = re.compile(r"\s*.+lldb::\s*", re.M) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("input") + parser.add_argument("output") + args = parser.parse_args() + input_path = str(args.input) + output_path = str(args.output) + with open(input_path, "r") as input_file: + lines = input_file.readlines() + file_buffer = "".join(lines) + + with open(output_path, "w") as output_file: + # NOTE: We do not use lldb-forward.h or lldb-versioning.h in RPC, so remove + # all includes that are found for these files. + file_buffer = re.sub(INCLUDES_TO_REMOVE_REGEX, r"", file_buffer) + + # For lldb-rpc-defines.h, replace the ifndef LLDB_LLDB_ portion with LLDB_RPC_ as we're not + # using LLDB private definitions in RPC. + lldb_guard_matches = LLDB_GUARD_REGEX.finditer(file_buffer) + for match in lldb_guard_matches: + file_buffer = re.sub( + match.group(), + r"{0}LLDB_RPC_".format(match.group("guard_type")), + file_buffer, + ) + + # Similarly to lldb-rpc-defines.h, replace the ifndef for LLDB_API in SBDefines.h to LLDB_RPC_API_ for the same reason. 
+ lldb_api_guard_matches = LLDB_API_GUARD_REGEX.finditer(file_buffer) + for match in lldb_api_guard_matches: + file_buffer = re.sub( + match.group(), + r"{0}LLDB_RPC_API_".format(match.group("guard_type")), + file_buffer, + ) + + # Replace the references for the macros that define the versioning strings in + # lldb-rpc-defines.h. + # NOTE: Here we assume that the versioning info has already been uncommented and + # populated from the original lldb-defines.h. + file_buffer = re.sub( + LLDB_VERSION_REGEX, r"#define LLDB_RPC_VERSION", file_buffer + ) + file_buffer = re.sub( + LLDB_REVISION_REGEX, r"#define LLDB_RPC_REVISION", file_buffer + ) + file_buffer = re.sub( + LLDB_VERSION_STRING_REGEX, r"#define LLDB_RPC_VERSION_STRING", file_buffer + ) + + # For local #includes + file_buffer = re.sub( + LLDB_LOCAL_INCLUDE_REGEX, r'#include "lldb-rpc-', file_buffer + ) + + # Rename the lldb namespace definition to lldb-rpc. + lldb_rpc_namespace_definition_matches = ( + LLDB_NAMESPACE_DEFINITION_REGEX.finditer(file_buffer) + ) + for match in lldb_rpc_namespace_definition_matches: + comment_marker = ( + match.group("comment_marker") if match.group("comment_marker") else "" + ) + file_buffer = re.sub( + match.group(), + r"{0}namespace lldb_rpc ".format(comment_marker), + file_buffer, + ) + + # Rename the lldb namespace definition to lldb-rpc. + file_buffer = re.sub(LLDB_NAMESPACE_REGEX, r"lldb_rpc::", file_buffer) + + output_file.write(file_buffer) + + +if __name__ == "__main__": + main() diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test new file mode 100644 index 0000000000000..0d89d627cfedf --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBDefines.test @@ -0,0 +1,22 @@ +RUN: mkdir -p %t/Outputs + +# Run the convert script on lldb-defines.h. 
+RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-defines.h %t/Outputs/lldb-rpc-defines.h + +# Check the output +RUN: cat %t/Outputs/lldb-rpc-defines.h | FileCheck %s + +# The include guards must change from LLDB_LLDB_DEFINES_H to LLDB_RPC_DEFINES_H. +CHECK: #ifndef LLDB_RPC_DEFINES_H +CHECK: #define LLDB_RPC_DEFINES_H + +# Includes of other lldb headers must begin with "lldb-rpc-". +CHECK: #include "lldb-rpc-types.h" + +# The version info must be changed from LLDB_VERSION to LLDB_RPC_VERSION +CHECK: #define LLDB_RPC_VERSION 21 +CHECK: #define LLDB_RPC_REVISION 12 +CHECK: #define LLDB_RPC_VERSION_STRING "21.0.12" + +# The comment that closes the include guard should match the guard. +CHECK: #endif // LLDB_RPC_DEFINES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test new file mode 100644 index 0000000000000..0fb3c6f73dd0f --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBEnumerations.test @@ -0,0 +1,17 @@ +RUN: mkdir -p %t/Outputs + +# Run the convert script on lldb-enumerations.h. +RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-enumerations.h %t/Outputs/lldb-rpc-enumerations.h + +# Check the output +RUN: cat %t/Outputs/lldb-rpc-enumerations.h | FileCheck %s + +# The include guards must change from LLDB_LLDB_ENUMERATIONS_H to LLDB_RPC_ENUMERATIONS_H. +CHECK: #ifndef LLDB_RPC_ENUMERATIONS_H +CHECK: #define LLDB_RPC_ENUMERATIONS_H + +# Change the namespace to lldb_rpc. Also, the comment that closes the namespace should match the namespace. +CHECK: namespace lldb_rpc {} // namespace lldb_rpc + +# The comment that closes the include guard should match the guard. 
+CHECK: #endif // LLDB_RPC_ENUMERATIONS_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test new file mode 100644 index 0000000000000..86f2d290209e1 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckLLDBTypes.test @@ -0,0 +1,24 @@ +RUN: mkdir -p %t/Outputs + +# Run the convert script on lldb-types.h. +RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/lldb-types.h %t/Outputs/lldb-rpc-types.h + +# Check the output +RUN: cat %t/Outputs/lldb-rpc-types.h | FileCheck %s + +# The include guards must change from LLDB_LLDB_TYPES_H to LLDB_RPC_TYPES_H. +CHECK: #ifndef LLDB_RPC_TYPES_H +CHECK: #define LLDB_RPC_TYPES_H + +# Includes of other lldb headers must begin with "lldb-rpc-". +# Also, the includes for lldb-forward.h should be removed. +CHECK: #include "lldb-rpc-enumerations.h" + +# Change the namespace to lldb_rpc. +CHECK: namespace lldb_rpc + +# The comment that closes the namespace should match the namespace. +CHECK: // namespace lldb_rpc + +# The comment that closes the include guard should match the guard. +CHECK: #endif // LLDB_RPC_TYPES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test new file mode 100644 index 0000000000000..72444aaf069a4 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/CheckSBDefines.test @@ -0,0 +1,22 @@ +RUN: mkdir -p %t/Outputs + +# Run the convert script on SBDefines.h. +RUN: %python %p/../../../../../scripts/convert-lldb-header-to-rpc-header.py %p/Inputs/SBDefines.h %t/Outputs/SBDefines.h + +# Check the output +RUN: cat %t/Outputs/SBDefines.h | FileCheck %s + +# The include guards must change from LLDB_LLDB_API_SBDEFINES_H to LLDB_RPC_API_SBDEFINES_H. 
+CHECK: #ifndef LLDB_RPC_API_SBDEFINES_H +CHECK: #define LLDB_RPC_API_SBDEFINES_H + +# Includes of other lldb headers must begin with "lldb-rpc-". +# Also, the includes for lldb-forward.h and lldb-versioning.h should be removed. +CHECK: #include "lldb-rpc-defines.h" +CHECK-NOT: #include "lldb-rpc-forward.h" +CHECK: #include "lldb-rpc-enumerations.h" +CHECK: #include "lldb-rpc-types.h" +CHECK-NOT: #include "lldb-rpc-versioning.h" + +# The comment that closes the include guard should match the guard. +CHECK: #endif // LLDB_RPC_API_SBDEFINES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h new file mode 100644 index 0000000000000..50476c402ba72 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/SBDefines.h @@ -0,0 +1,22 @@ +// This is a truncated version of SBDefines.h used to test that the script +// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in +// the original file to RPC references. + +// The include guard should change from LLDB_LLDB to LLDB_RPC. +// LLDB_API_SBDEFINES_H -> LLDB_RPC_SBDEFINES_H +#ifndef LLDB_API_SBDEFINES_H +#define LLDB_API_SBDEFINES_H + +// Includes of public main LLDB headers should change to their RPC equivalents: +// "lldb/lldb-defines.h" -> "lldb-rpc-defines.h" +// Also, the includes for lldb-forward.h and lldb-versioning.h should be removed. +#include "lldb/lldb-defines.h" +#include "lldb/lldb-enumerations.h" +#include "lldb/lldb-forward.h" +#include "lldb/lldb-types.h" +#include "lldb/lldb-versioning.h" + +// The comment that closes the include guard must change in the same way +// the original guard did. 
+// #endif // LLDB_API_SBDEFINES_H -> #endif // LLDB_RPC_API_SBDEFINES_H +#endif // LLDB_API_SBDEFINES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h new file mode 100644 index 0000000000000..32064430b3d04 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-defines.h @@ -0,0 +1,23 @@ +// This is a truncated version of lldb-defines.h used to test that the script +// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in +// the original file to RPC references. + +// The include guard should change from LLDB_LLDB to LLDB_RPC. +// LLDB_LLDB_DEFINES_H -> LLDB_RPC_DEFINES_H +#ifndef LLDB_LLDB_DEFINES_H +#define LLDB_LLDB_DEFINES_H + +// Includes of public main LLDB headers should change to their RPC equivalents: +// "lldb/lldb-types.h" -> "lldb-rpc-types.h" +#include "lldb/lldb-types.h" + +// The LLDB version must change from LLDB to LLDB_RPC +// LLDB_VERSION -> LLDB_RPC_VERSION +#define LLDB_VERSION 21 +#define LLDB_REVISION 12 +#define LLDB_VERSION_STRING "21.0.12" + +// The comment that closes the include guard must change in the same way +// the original guard did. +// #endif // LLDB_LLDB_DEFINES_H -> #endif // LLDB_RPC_DEFINES_H +#endif // LLDB_LLDB_DEFINES_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h new file mode 100644 index 0000000000000..42c4bb277fc45 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-enumerations.h @@ -0,0 +1,17 @@ +// This is a truncated version of lldb-enumerations.h used to test that the script +// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in +// the original file to RPC references. + +// The include guard should change from LLDB_LLDB to LLDB_RPC. 
+// LLDB_LLDB_ENUMERATIONS_H -> LLDB_RPC_ENUMERATIONS_H +#ifndef LLDB_LLDB_ENUMERATIONS_H +#define LLDB_LLDB_ENUMERATIONS_H + +// The namespace definition should change to the lldb_rpc namespace, so should the comment that closes it: +// namespace lldb -> namespace lldb_rpc +namespace lldb {} // namespace lldb + +// The comment that closes the include guard must change in the same way +// the original guard did: +// #endif // LLDB_LLDB_ENUMERATIONS_H -> #endif // LLDB_RPC_ENUMERATIONS_H +#endif // LLDB_LLDB_ENUMERATIONS_H diff --git a/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h new file mode 100644 index 0000000000000..5a49920405ec6 --- /dev/null +++ b/lldb/test/Shell/RPC/Scripts/TestConvertScript/Inputs/lldb-types.h @@ -0,0 +1,23 @@ +// This is a truncated version of lldb-types.h used to test that the script +// convert-lldb-header-to-rpc-header.py works correctly. The script changes LLDB references in +// the original file to RPC references. + +// The include guard should change from LLDB_LLDB to LLDB_RPC. +// LLDB_LLDB_TYPES_H -> LLDB_RPC_TYPES_H +#ifndef LLDB_LLDB_TYPES_H +#define LLDB_LLDB_TYPES_H + +// Includes of public main LLDB headers should change to their RPC equivalents: +// "lldb/lldb-defines.h" -> "lldb-rpc-defines.h": +// Also, the includes for lldb-forward.h should be removed. 
+#include "lldb/lldb-enumerations.h" +#include "lldb/lldb-forward.h" + +// The namespace definition should change to the lldb_rpc namespace, so should the comment that closes it: +// namespace lldb -> namespace lldb_rpc +namespace lldb {} // namespace lldb + +// The comment that closes the include guard must change in the same way +// the original guard did: +// #endif // LLDB_LLDB_TYPES_H -> #endif // LLDB_RPC_TYPES_H +#endif // LLDB_LLDB_TYPES_H From b42aef5e6f32a3ac6c259cb4cacf58239400b5aa Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Wed, 11 Jun 2025 13:12:59 -0700 Subject: [PATCH 145/851] [flang] Don't duplicate hermetic module file dependencies (#143605) When emitting the modules on which a module depends under the -fhermetic-module-files options, eliminate duplicates by name rather than by symbol addresses. This way, when a dependent module is in the symbol table more than once due to the use of a nested hermetic module, it doesn't get emitted multiple times to the new module file. 
--- flang/lib/Semantics/mod-file.cpp | 18 +++++++++------ flang/test/Semantics/modfile77.F90 | 37 ++++++++++++++++++++++++++++++ flang/test/Semantics/modfile78.F90 | 33 ++++++++++++++++++++++++++ 3 files changed, 81 insertions(+), 7 deletions(-) create mode 100644 flang/test/Semantics/modfile77.F90 create mode 100644 flang/test/Semantics/modfile78.F90 diff --git a/flang/lib/Semantics/mod-file.cpp b/flang/lib/Semantics/mod-file.cpp index a72641866aa15..9f9e9f5840456 100644 --- a/flang/lib/Semantics/mod-file.cpp +++ b/flang/lib/Semantics/mod-file.cpp @@ -143,18 +143,22 @@ void ModFileWriter::Write(const Symbol &symbol) { std::string path{context_.moduleDirectory() + '/' + ModFileName(symbol.name(), ancestorName, context_.moduleFileSuffix())}; - UnorderedSymbolSet hermeticModules; - hermeticModules.insert(symbol); + std::set hermeticModuleNames; + hermeticModuleNames.insert(symbol.name().ToString()); UnorderedSymbolSet additionalModules; PutSymbols(DEREF(symbol.scope()), hermeticModuleFileOutput_ ? 
&additionalModules : nullptr); auto asStr{GetAsString(symbol)}; while (!additionalModules.empty()) { - for (auto ref : UnorderedSymbolSet{std::move(additionalModules)}) { - if (hermeticModules.insert(*ref).second && - !ref->owner().IsIntrinsicModules()) { - PutSymbols(DEREF(ref->scope()), &additionalModules); - asStr += GetAsString(*ref); + UnorderedSymbolSet nextPass{std::move(additionalModules)}; + additionalModules.clear(); + for (const Symbol &modSym : nextPass) { + if (!modSym.owner().IsIntrinsicModules() && + hermeticModuleNames.find(modSym.name().ToString()) == + hermeticModuleNames.end()) { + hermeticModuleNames.insert(modSym.name().ToString()); + PutSymbols(DEREF(modSym.scope()), &additionalModules); + asStr += GetAsString(modSym); } } } diff --git a/flang/test/Semantics/modfile77.F90 b/flang/test/Semantics/modfile77.F90 new file mode 100644 index 0000000000000..a82904ebbcc22 --- /dev/null +++ b/flang/test/Semantics/modfile77.F90 @@ -0,0 +1,37 @@ +!RUN: %flang -c -fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile77c.mod | FileCheck %s + +#if WHICH == 1 +module modfile77a + interface gen + procedure proc + end interface + contains + subroutine proc + print *, 'ok' + end +end +#elif WHICH == 2 +module modfile77b + use modfile77a +end +#else +module modfile77c + use modfile77a + use modfile77b +end +#endif + +!CHECK: module modfile77c +!CHECK: use modfile77a,only:proc +!CHECK: use modfile77a,only:gen +!CHECK: interface gen +!CHECK: end interface +!CHECK: end +!CHECK: module modfile77a +!CHECK: interface gen +!CHECK: procedure::proc +!CHECK: end interface +!CHECK: contains +!CHECK: subroutine proc() +!CHECK: end +!CHECK: end diff --git a/flang/test/Semantics/modfile78.F90 b/flang/test/Semantics/modfile78.F90 new file mode 100644 index 0000000000000..cb3eccd9a4108 --- /dev/null +++ b/flang/test/Semantics/modfile78.F90 @@ -0,0 +1,33 @@ +!RUN: %flang -c 
-fhermetic-module-files -DWHICH=1 %s && %flang -c -fhermetic-module-files -DWHICH=2 %s && %flang -c -fhermetic-module-files %s && cat modfile78c.mod | FileCheck %s + +#if WHICH == 1 +module modfile78a + integer :: global_variable = 0 +end +#elif WHICH == 2 +module modfile78b + use modfile78a + contains + subroutine test + end +end +#else +module modfile78c + use modfile78a + use modfile78b +end +#endif + +!CHECK: module modfile78c +!CHECK: use modfile78a,only:global_variable +!CHECK: use modfile78b,only:test +!CHECK: end +!CHECK: module modfile78a +!CHECK: integer(4)::global_variable +!CHECK: end +!CHECK: module modfile78b +!CHECK: use modfile78a,only:global_variable +!CHECK: contains +!CHECK: subroutine test() +!CHECK: end +!CHECK: end From e389a0e7bb3d7aabbd10b9ba8f432f292de65649 Mon Sep 17 00:00:00 2001 From: Uzair Nawaz Date: Wed, 11 Jun 2025 20:17:35 +0000 Subject: [PATCH 146/851] [libc] Switched calls to inline_memcpy to __builtin_memcpy for wide char utilities (#143011) Switched calls to inline_memcpy to __builtin_memcpy for wide char utilities Removed unnecessary wctype_utils dependencies from the cmake file --- libc/src/wchar/CMakeLists.txt | 9 --------- libc/src/wchar/wcscpy.cpp | 3 +-- libc/src/wchar/wcsncpy.cpp | 2 -- libc/src/wchar/wmemcpy.cpp | 3 +-- libc/src/wchar/wmempcpy.cpp | 3 +-- 5 files changed, 3 insertions(+), 17 deletions(-) diff --git a/libc/src/wchar/CMakeLists.txt b/libc/src/wchar/CMakeLists.txt index 759f708c2247a..4b8802ede5f5d 100644 --- a/libc/src/wchar/CMakeLists.txt +++ b/libc/src/wchar/CMakeLists.txt @@ -43,7 +43,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.types.wchar_t - libc.src.__support.wctype_utils ) add_entrypoint_object( @@ -54,7 +53,6 @@ add_entrypoint_object( wcschr.h DEPENDS libc.hdr.wchar_macros - libc.src.__support.wctype_utils ) add_entrypoint_object( @@ -75,7 +73,6 @@ add_entrypoint_object( wcspbrk.h DEPENDS libc.hdr.wchar_macros - libc.src.__support.wctype_utils 
libc.src.__support.macros.null_check ) @@ -109,7 +106,6 @@ add_entrypoint_object( DEPENDS libc.hdr.wchar_macros libc.hdr.types.size_t - libc.src.__support.wctype_utils ) add_entrypoint_object( @@ -121,7 +117,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.wchar_macros - libc.src.__support.wctype_utils libc.src.__support.macros.null_check ) @@ -134,7 +129,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.wchar_macros - libc.src.__support.wctype_utils ) add_entrypoint_object( @@ -205,8 +199,6 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.wchar_macros - libc.src.__support.wctype_utils - libc.src.string.memory_utils.inline_memcpy ) add_entrypoint_object( @@ -218,6 +210,5 @@ add_entrypoint_object( DEPENDS libc.hdr.types.size_t libc.hdr.wchar_macros - libc.src.string.memory_utils.inline_memcpy libc.src.string.string_utils ) diff --git a/libc/src/wchar/wcscpy.cpp b/libc/src/wchar/wcscpy.cpp index dc46b972c59f7..01ba994cecbb2 100644 --- a/libc/src/wchar/wcscpy.cpp +++ b/libc/src/wchar/wcscpy.cpp @@ -12,7 +12,6 @@ #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "src/string/memory_utils/inline_memcpy.h" #include "src/string/string_utils.h" namespace LIBC_NAMESPACE_DECL { @@ -20,7 +19,7 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wcscpy, (wchar_t *__restrict s1, const wchar_t *__restrict s2)) { size_t size = internal::string_length(s2) + 1; - inline_memcpy(s1, s2, size * sizeof(wchar_t)); + __builtin_memcpy(s1, s2, size * sizeof(wchar_t)); return s1; } diff --git a/libc/src/wchar/wcsncpy.cpp b/libc/src/wchar/wcsncpy.cpp index e7ae9a4a0da79..7ad6730cd776b 100644 --- a/libc/src/wchar/wcsncpy.cpp +++ b/libc/src/wchar/wcsncpy.cpp @@ -12,8 +12,6 @@ #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "src/string/memory_utils/inline_memcpy.h" -#include 
"src/string/string_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/wchar/wmemcpy.cpp b/libc/src/wchar/wmemcpy.cpp index 56708d6cee496..bf92309b20944 100644 --- a/libc/src/wchar/wmemcpy.cpp +++ b/libc/src/wchar/wmemcpy.cpp @@ -12,14 +12,13 @@ #include "hdr/types/wchar_t.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" -#include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wmemcpy, (wchar_t *__restrict s1, const wchar_t *__restrict s2, size_t n)) { - inline_memcpy(s1, s2, n * sizeof(wchar_t)); + __builtin_memcpy(s1, s2, n * sizeof(wchar_t)); return s1; } diff --git a/libc/src/wchar/wmempcpy.cpp b/libc/src/wchar/wmempcpy.cpp index d8b89c0a88d05..21e16210a757a 100644 --- a/libc/src/wchar/wmempcpy.cpp +++ b/libc/src/wchar/wmempcpy.cpp @@ -11,14 +11,13 @@ #include "hdr/types/size_t.h" #include "hdr/types/wchar_t.h" #include "src/__support/common.h" -#include "src/string/memory_utils/inline_memcpy.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(wchar_t *, wmempcpy, (wchar_t *__restrict to, const wchar_t *__restrict from, size_t size)) { - inline_memcpy(to, from, size * sizeof(wchar_t)); + __builtin_memcpy(to, from, size * sizeof(wchar_t)); return reinterpret_cast(to) + size; } From fb761aa38b0bc01ab911f5dbbfb474b70aaafbb4 Mon Sep 17 00:00:00 2001 From: Rolf Morel Date: Wed, 11 Jun 2025 21:19:52 +0100 Subject: [PATCH 147/851] [MLIR][Transform] apply_registered_op fixes: arg order & python options auto-conversion (#143779) --- .../mlir/Dialect/Transform/IR/TransformOps.td | 6 +++--- .../mlir/dialects/transform/__init__.py | 18 +++++++++++------- .../Transform/test-pass-application.mlir | 19 +++++++++---------- mlir/test/python/dialects/transform.py | 10 +++++----- 4 files changed, 28 insertions(+), 25 deletions(-) diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td index 
f75ba27e58e76..0aa750e625436 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformOps.td @@ -434,10 +434,10 @@ def ApplyRegisteredPassOp : TransformDialectOp<"apply_registered_pass", of targeted ops. }]; - let arguments = (ins StrAttr:$pass_name, + let arguments = (ins TransformHandleTypeInterface:$target, + StrAttr:$pass_name, DefaultValuedAttr:$options, - Variadic:$dynamic_options, - TransformHandleTypeInterface:$target); + Variadic:$dynamic_options); let results = (outs TransformHandleTypeInterface:$result); let assemblyFormat = [{ $pass_name (`with` `options` `=` diff --git a/mlir/python/mlir/dialects/transform/__init__.py b/mlir/python/mlir/dialects/transform/__init__.py index 10a04b0cc14e0..bfe96b1b3e5d4 100644 --- a/mlir/python/mlir/dialects/transform/__init__.py +++ b/mlir/python/mlir/dialects/transform/__init__.py @@ -224,13 +224,13 @@ class ApplyRegisteredPassOp(ApplyRegisteredPassOp): def __init__( self, result: Type, - pass_name: Union[str, StringAttr], target: Union[Operation, Value, OpView], + pass_name: Union[str, StringAttr], *, options: Optional[ Dict[ Union[str, StringAttr], - Union[Attribute, Value, Operation, OpView], + Union[Attribute, Value, Operation, OpView, str, int, bool], ] ] = None, loc=None, @@ -253,17 +253,21 @@ def __init__( cur_param_operand_idx += 1 elif isinstance(value, Attribute): options_dict[key] = value + # The following cases auto-convert Python values to attributes. 
+ elif isinstance(value, bool): + options_dict[key] = BoolAttr.get(value) + elif isinstance(value, int): + default_int_type = IntegerType.get_signless(64, context) + options_dict[key] = IntegerAttr.get(default_int_type, value) elif isinstance(value, str): options_dict[key] = StringAttr.get(value) else: raise TypeError(f"Unsupported option type: {type(value)}") - if len(options_dict) > 0: - print(options_dict, cur_param_operand_idx) super().__init__( result, + _get_op_result_or_value(target), pass_name, dynamic_options, - target=_get_op_result_or_value(target), options=DictAttr.get(options_dict), loc=loc, ip=ip, @@ -272,13 +276,13 @@ def __init__( def apply_registered_pass( result: Type, - pass_name: Union[str, StringAttr], target: Union[Operation, Value, OpView], + pass_name: Union[str, StringAttr], *, options: Optional[ Dict[ Union[str, StringAttr], - Union[Attribute, Value, Operation, OpView], + Union[Attribute, Value, Operation, OpView, str, int, bool], ] ] = None, loc=None, diff --git a/mlir/test/Dialect/Transform/test-pass-application.mlir b/mlir/test/Dialect/Transform/test-pass-application.mlir index 6e6d4eb7e249f..1d1be9eda3496 100644 --- a/mlir/test/Dialect/Transform/test-pass-application.mlir +++ b/mlir/test/Dialect/Transform/test-pass-application.mlir @@ -157,7 +157,7 @@ module attributes {transform.with_named_sequence} { "test-convergence" = true, "max-num-rewrites" = %max_rewrites } to %1 - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op transform.yield } } @@ -171,7 +171,6 @@ func.func @invalid_options_as_str() { module attributes {transform.with_named_sequence} { transform.named_sequence @__transform_main(%arg1: !transform.any_op) { %1 = transform.structured.match ops{["func.func"]} in %arg1 : (!transform.any_op) -> !transform.any_op - %max_iter = transform.param.constant "max-iterations=10" -> !transform.any_param // 
expected-error @+2 {{expected '{' in options dictionary}} %2 = transform.apply_registered_pass "canonicalize" with options = "top-down=false" to %1 : (!transform.any_op) -> !transform.any_op @@ -256,7 +255,7 @@ module attributes {transform.with_named_sequence} { // expected-error @+2 {{expected '{' in options dictionary}} transform.apply_registered_pass "canonicalize" with options = %pass_options to %1 - : (!transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param) -> !transform.any_op transform.yield } } @@ -276,7 +275,7 @@ module attributes {transform.with_named_sequence} { // expected-error @below {{options passed as a param must have a single value associated, param 0 associates 2}} transform.apply_registered_pass "canonicalize" with options = { "top-down" = %topdown_options } to %1 - : (!transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param) -> !transform.any_op transform.yield } } @@ -316,12 +315,12 @@ module attributes {transform.with_named_sequence} { %0 = "transform.structured.match"(%arg0) <{ops = ["func.func"]}> : (!transform.any_op) -> !transform.any_op %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param // expected-error @below {{dynamic option index 1 is out of bounds for the number of dynamic options: 1}} - %2 = "transform.apply_registered_pass"(%1, %0) <{ + %2 = "transform.apply_registered_pass"(%0, %1) <{ options = {"max-iterations" = #transform.param_operand, "test-convergence" = true, "top-down" = false}, pass_name = "canonicalize"}> - : (!transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param) -> !transform.any_op "transform.yield"() : () -> () }) : () -> () }) {transform.with_named_sequence} : () -> () @@ -340,13 +339,13 @@ module attributes {transform.with_named_sequence} { %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param %2 = 
"transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param // expected-error @below {{dynamic option index 0 is already used in options}} - %3 = "transform.apply_registered_pass"(%1, %2, %0) <{ + %3 = "transform.apply_registered_pass"(%0, %1, %2) <{ options = {"max-iterations" = #transform.param_operand, "max-num-rewrites" = #transform.param_operand, "test-convergence" = true, "top-down" = false}, pass_name = "canonicalize"}> - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op "transform.yield"() : () -> () }) : () -> () }) {transform.with_named_sequence} : () -> () @@ -364,12 +363,12 @@ module attributes {transform.with_named_sequence} { %1 = "transform.param.constant"() <{value = 10 : i64}> : () -> !transform.any_param %2 = "transform.param.constant"() <{value = 1 : i64}> : () -> !transform.any_param // expected-error @below {{a param operand does not have a corresponding param_operand attr in the options dict}} - %3 = "transform.apply_registered_pass"(%1, %2, %0) <{ + %3 = "transform.apply_registered_pass"(%0, %1, %2) <{ options = {"max-iterations" = #transform.param_operand, "test-convergence" = true, "top-down" = false}, pass_name = "canonicalize"}> - : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op "transform.yield"() : () -> () }) : () -> () }) {transform.with_named_sequence} : () -> () diff --git a/mlir/test/python/dialects/transform.py b/mlir/test/python/dialects/transform.py index 48bc9bad37a1e..eeb95605d7a9a 100644 --- a/mlir/test/python/dialects/transform.py +++ b/mlir/test/python/dialects/transform.py @@ -263,12 +263,12 @@ def testApplyRegisteredPassOp(module: Module): ) with InsertionPoint(sequence.body): mod = transform.ApplyRegisteredPassOp( - transform.AnyOpType.get(), 
"canonicalize", sequence.bodyTarget + transform.AnyOpType.get(), sequence.bodyTarget, "canonicalize" ) mod = transform.ApplyRegisteredPassOp( transform.AnyOpType.get(), - "canonicalize", mod.result, + "canonicalize", options={"top-down": BoolAttr.get(False)}, ) max_iter = transform.param_constant( @@ -281,12 +281,12 @@ def testApplyRegisteredPassOp(module: Module): ) transform.apply_registered_pass( transform.AnyOpType.get(), - "canonicalize", mod, + "canonicalize", options={ "top-down": BoolAttr.get(False), "max-iterations": max_iter, - "test-convergence": BoolAttr.get(True), + "test-convergence": True, "max-rewrites": max_rewrites, }, ) @@ -305,4 +305,4 @@ def testApplyRegisteredPassOp(module: Module): # CHECK-SAME: "max-rewrites" = %[[MAX_REWRITE]], # CHECK-SAME: "test-convergence" = true, # CHECK-SAME: "top-down" = false} - # CHECK-SAME: to %{{.*}} : (!transform.any_param, !transform.any_param, !transform.any_op) -> !transform.any_op + # CHECK-SAME: to %{{.*}} : (!transform.any_op, !transform.any_param, !transform.any_param) -> !transform.any_op From d87eea35fac5a34a841c637db8908128409a184e Mon Sep 17 00:00:00 2001 From: lntue Date: Wed, 11 Jun 2025 16:25:27 -0400 Subject: [PATCH 148/851] [libc] Move libc_errno.h to libc/src/__support and make LIBC_ERRNO_MODE_SYSTEM to be header-only. 
(#143187) This is the first step in preparation for: https://discourse.llvm.org/t/rfc-make-clang-builtin-math-functions-constexpr-with-llvm-libc-to-support-c-23-constexpr-math-functions/86450 --- .../modules/LLVMLibCCompileOptionRules.cmake | 4 + libc/config/config.json | 2 +- libc/docs/dev/code_style.rst | 4 +- libc/shared/fp_bits.h | 1 + libc/shared/libc_common.h | 26 +++++ libc/shared/rpc_server.h | 1 + libc/shared/str_to_float.h | 1 + libc/shared/str_to_integer.h | 1 + libc/src/__support/CMakeLists.txt | 9 ++ libc/src/__support/FPUtil/FEnvImpl.h | 2 +- libc/src/__support/File/dir.cpp | 2 +- libc/src/__support/File/file.cpp | 2 +- libc/src/__support/File/linux/file.cpp | 2 +- libc/src/__support/File/linux/lseekImpl.h | 2 +- libc/src/__support/HashTable/randomness.h | 2 +- libc/src/__support/OSUtil/linux/fcntl.cpp | 2 +- libc/src/__support/OSUtil/linux/vdso.cpp | 2 +- .../tables/linux_extension_errors.h | 2 +- libc/src/__support/libc_errno.h | 108 ++++++++++++++++++ libc/src/__support/threads/linux/thread.cpp | 2 +- libc/src/dirent/closedir.cpp | 2 +- libc/src/dirent/opendir.cpp | 2 +- libc/src/dirent/readdir.cpp | 2 +- libc/src/errno/CMakeLists.txt | 20 +--- libc/src/errno/libc_errno.cpp | 47 +------- libc/src/errno/libc_errno.h | 47 -------- libc/src/fcntl/linux/creat.cpp | 2 +- libc/src/fcntl/linux/open.cpp | 2 +- libc/src/fcntl/linux/openat.cpp | 2 +- libc/src/inttypes/strtoimax.cpp | 2 +- libc/src/inttypes/strtoumax.cpp | 2 +- libc/src/math/generic/exp10m1f.cpp | 2 +- libc/src/math/generic/exp2m1f.cpp | 2 +- libc/src/math/generic/nan.cpp | 2 +- libc/src/math/generic/nanf.cpp | 2 +- libc/src/math/generic/nanf128.cpp | 2 +- libc/src/math/generic/nanf16.cpp | 2 +- libc/src/math/generic/nanl.cpp | 2 +- libc/src/poll/linux/poll.cpp | 2 +- libc/src/pthread/pthread_atfork.cpp | 2 +- .../pthread/pthread_attr_setdetachstate.cpp | 2 +- .../src/pthread/pthread_attr_setguardsize.cpp | 2 +- libc/src/pthread/pthread_attr_setstack.cpp | 2 +- 
.../src/pthread/pthread_attr_setstacksize.cpp | 2 +- .../src/pthread/pthread_condattr_setclock.cpp | 2 +- .../pthread/pthread_condattr_setpshared.cpp | 2 +- libc/src/pthread/pthread_create.cpp | 2 +- libc/src/pthread/pthread_key_create.cpp | 2 +- libc/src/pthread/pthread_key_delete.cpp | 2 +- .../pthread/pthread_mutexattr_setpshared.cpp | 2 +- .../pthread/pthread_mutexattr_setrobust.cpp | 2 +- .../src/pthread/pthread_mutexattr_settype.cpp | 2 +- .../pthread/pthread_rwlock_timedrdlock.cpp | 2 +- libc/src/pthread/pthread_rwlock_trywrlock.cpp | 2 +- libc/src/pthread/pthread_rwlock_unlock.cpp | 2 +- .../pthread/pthread_rwlockattr_setkind_np.cpp | 2 +- .../pthread/pthread_rwlockattr_setpshared.cpp | 2 +- libc/src/pthread/pthread_setspecific.cpp | 2 +- .../sched/linux/sched_get_priority_max.cpp | 2 +- .../sched/linux/sched_get_priority_min.cpp | 2 +- libc/src/sched/linux/sched_getaffinity.cpp | 2 +- libc/src/sched/linux/sched_getparam.cpp | 2 +- libc/src/sched/linux/sched_getscheduler.cpp | 2 +- .../src/sched/linux/sched_rr_get_interval.cpp | 2 +- libc/src/sched/linux/sched_setaffinity.cpp | 2 +- libc/src/sched/linux/sched_setparam.cpp | 2 +- libc/src/sched/linux/sched_setscheduler.cpp | 2 +- libc/src/sched/linux/sched_yield.cpp | 2 +- libc/src/search/hcreate.cpp | 2 +- libc/src/search/hcreate_r.cpp | 2 +- libc/src/search/hdestroy_r.cpp | 2 +- libc/src/search/hsearch.cpp | 2 +- libc/src/search/hsearch_r.cpp | 2 +- libc/src/signal/linux/kill.cpp | 2 +- libc/src/signal/linux/sigaction.cpp | 2 +- libc/src/signal/linux/sigaddset.cpp | 2 +- libc/src/signal/linux/sigaltstack.cpp | 2 +- libc/src/signal/linux/sigdelset.cpp | 2 +- libc/src/signal/linux/sigemptyset.cpp | 2 +- libc/src/signal/linux/sigfillset.cpp | 2 +- libc/src/signal/linux/sigprocmask.cpp | 2 +- .../posix_spawn_file_actions_addclose.cpp | 2 +- .../posix_spawn_file_actions_adddup2.cpp | 2 +- .../posix_spawn_file_actions_addopen.cpp | 2 +- .../posix_spawn_file_actions_destroy.cpp | 2 +- 
libc/src/stdio/fopencookie.cpp | 2 +- libc/src/stdio/generic/fclose.cpp | 2 +- libc/src/stdio/generic/fflush.cpp | 2 +- libc/src/stdio/generic/fgetc.cpp | 2 +- libc/src/stdio/generic/fgetc_unlocked.cpp | 2 +- libc/src/stdio/generic/fgets.cpp | 2 +- libc/src/stdio/generic/fopen.cpp | 2 +- libc/src/stdio/generic/fputc.cpp | 2 +- libc/src/stdio/generic/fputs.cpp | 2 +- libc/src/stdio/generic/fread.cpp | 2 +- libc/src/stdio/generic/fread_unlocked.cpp | 2 +- libc/src/stdio/generic/fseek.cpp | 2 +- libc/src/stdio/generic/fseeko.cpp | 2 +- libc/src/stdio/generic/ftell.cpp | 2 +- libc/src/stdio/generic/ftello.cpp | 2 +- libc/src/stdio/generic/fwrite.cpp | 2 +- libc/src/stdio/generic/fwrite_unlocked.cpp | 2 +- libc/src/stdio/generic/getc.cpp | 2 +- libc/src/stdio/generic/getc_unlocked.cpp | 2 +- libc/src/stdio/generic/getchar.cpp | 2 +- libc/src/stdio/generic/getchar_unlocked.cpp | 2 +- libc/src/stdio/generic/putc.cpp | 2 +- libc/src/stdio/generic/putchar.cpp | 2 +- libc/src/stdio/generic/puts.cpp | 2 +- libc/src/stdio/gpu/fprintf.cpp | 2 +- libc/src/stdio/gpu/printf.cpp | 2 +- libc/src/stdio/linux/fdopen.cpp | 2 +- libc/src/stdio/linux/remove.cpp | 2 +- libc/src/stdio/linux/rename.cpp | 2 +- libc/src/stdio/printf_core/parser.h | 2 +- libc/src/stdio/setbuf.cpp | 2 +- libc/src/stdio/setvbuf.cpp | 2 +- libc/src/stdlib/atof.cpp | 2 +- libc/src/stdlib/atoi.cpp | 2 +- libc/src/stdlib/atol.cpp | 2 +- libc/src/stdlib/atoll.cpp | 2 +- libc/src/stdlib/strtod.cpp | 2 +- libc/src/stdlib/strtod_l.cpp | 2 +- libc/src/stdlib/strtof.cpp | 2 +- libc/src/stdlib/strtof_l.cpp | 2 +- libc/src/stdlib/strtol.cpp | 2 +- libc/src/stdlib/strtol_l.cpp | 2 +- libc/src/stdlib/strtold.cpp | 2 +- libc/src/stdlib/strtold_l.cpp | 2 +- libc/src/stdlib/strtoll.cpp | 2 +- libc/src/stdlib/strtoll_l.cpp | 2 +- libc/src/stdlib/strtoul.cpp | 2 +- libc/src/stdlib/strtoul_l.cpp | 2 +- libc/src/stdlib/strtoull.cpp | 2 +- libc/src/stdlib/strtoull_l.cpp | 2 +- libc/src/string/strdup.cpp | 2 +- 
libc/src/sys/auxv/linux/getauxval.cpp | 2 +- libc/src/sys/epoll/linux/epoll_create.cpp | 2 +- libc/src/sys/epoll/linux/epoll_create1.cpp | 2 +- libc/src/sys/epoll/linux/epoll_ctl.cpp | 2 +- libc/src/sys/epoll/linux/epoll_pwait.cpp | 2 +- libc/src/sys/epoll/linux/epoll_pwait2.cpp | 2 +- libc/src/sys/epoll/linux/epoll_wait.cpp | 2 +- libc/src/sys/mman/linux/madvise.cpp | 2 +- libc/src/sys/mman/linux/mincore.cpp | 2 +- libc/src/sys/mman/linux/mlock.cpp | 2 +- libc/src/sys/mman/linux/mlock2.cpp | 2 +- libc/src/sys/mman/linux/mlockall.cpp | 2 +- libc/src/sys/mman/linux/mmap.cpp | 2 +- libc/src/sys/mman/linux/mprotect.cpp | 2 +- libc/src/sys/mman/linux/mremap.cpp | 2 +- libc/src/sys/mman/linux/msync.cpp | 2 +- libc/src/sys/mman/linux/munlock.cpp | 2 +- libc/src/sys/mman/linux/munlockall.cpp | 2 +- libc/src/sys/mman/linux/munmap.cpp | 4 +- libc/src/sys/mman/linux/remap_file_pages.cpp | 2 +- libc/src/sys/mman/linux/shm_common.h | 2 +- libc/src/sys/prctl/linux/prctl.cpp | 2 +- libc/src/sys/random/linux/getrandom.cpp | 2 +- libc/src/sys/resource/linux/getrlimit.cpp | 2 +- libc/src/sys/resource/linux/setrlimit.cpp | 2 +- libc/src/sys/select/linux/select.cpp | 2 +- libc/src/sys/sendfile/linux/sendfile.cpp | 2 +- libc/src/sys/socket/linux/bind.cpp | 2 +- libc/src/sys/socket/linux/recv.cpp | 2 +- libc/src/sys/socket/linux/recvfrom.cpp | 2 +- libc/src/sys/socket/linux/recvmsg.cpp | 2 +- libc/src/sys/socket/linux/send.cpp | 2 +- libc/src/sys/socket/linux/sendmsg.cpp | 2 +- libc/src/sys/socket/linux/sendto.cpp | 2 +- libc/src/sys/socket/linux/socket.cpp | 2 +- libc/src/sys/socket/linux/socketpair.cpp | 2 +- libc/src/sys/stat/linux/chmod.cpp | 2 +- libc/src/sys/stat/linux/fchmod.cpp | 2 +- libc/src/sys/stat/linux/fchmodat.cpp | 2 +- libc/src/sys/stat/linux/fstat.cpp | 2 +- libc/src/sys/stat/linux/lstat.cpp | 2 +- libc/src/sys/stat/linux/mkdir.cpp | 2 +- libc/src/sys/stat/linux/mkdirat.cpp | 2 +- libc/src/sys/stat/linux/stat.cpp | 2 +- libc/src/sys/statvfs/linux/statfs_utils.h | 2 +- 
libc/src/sys/time/linux/getitimer.cpp | 2 +- libc/src/sys/time/linux/setitimer.cpp | 2 +- libc/src/sys/time/linux/utimes.cpp | 2 +- libc/src/sys/uio/linux/readv.cpp | 2 +- libc/src/sys/uio/linux/writev.cpp | 2 +- libc/src/sys/utsname/linux/uname.cpp | 2 +- libc/src/sys/wait/wait4Impl.h | 2 +- libc/src/termios/linux/cfsetispeed.cpp | 2 +- libc/src/termios/linux/cfsetospeed.cpp | 2 +- libc/src/termios/linux/tcdrain.cpp | 2 +- libc/src/termios/linux/tcflow.cpp | 2 +- libc/src/termios/linux/tcflush.cpp | 2 +- libc/src/termios/linux/tcgetattr.cpp | 2 +- libc/src/termios/linux/tcgetsid.cpp | 2 +- libc/src/termios/linux/tcsendbreak.cpp | 2 +- libc/src/termios/linux/tcsetattr.cpp | 2 +- libc/src/threads/thrd_create.cpp | 2 +- libc/src/time/linux/clock.cpp | 2 +- libc/src/time/linux/clock_gettime.cpp | 2 +- libc/src/time/linux/gettimeofday.cpp | 2 +- libc/src/time/linux/nanosleep.cpp | 2 +- libc/src/time/linux/timespec_get.cpp | 2 +- libc/src/time/time.cpp | 2 +- libc/src/time/time_utils.h | 2 +- libc/src/time/windows/clock_getres.cpp | 2 +- libc/src/unistd/linux/access.cpp | 2 +- libc/src/unistd/linux/chdir.cpp | 2 +- libc/src/unistd/linux/close.cpp | 2 +- libc/src/unistd/linux/dup.cpp | 2 +- libc/src/unistd/linux/dup2.cpp | 2 +- libc/src/unistd/linux/dup3.cpp | 2 +- libc/src/unistd/linux/execv.cpp | 2 +- libc/src/unistd/linux/execve.cpp | 2 +- libc/src/unistd/linux/fchdir.cpp | 2 +- libc/src/unistd/linux/fork.cpp | 2 +- libc/src/unistd/linux/fsync.cpp | 2 +- libc/src/unistd/linux/ftruncate.cpp | 2 +- libc/src/unistd/linux/getcwd.cpp | 2 +- libc/src/unistd/linux/getentropy.cpp | 2 +- libc/src/unistd/linux/getsid.cpp | 2 +- libc/src/unistd/linux/isatty.cpp | 2 +- libc/src/unistd/linux/link.cpp | 2 +- libc/src/unistd/linux/linkat.cpp | 2 +- libc/src/unistd/linux/lseek.cpp | 2 +- libc/src/unistd/linux/pathconf.cpp | 2 +- libc/src/unistd/linux/pathconf_utils.cpp | 2 +- libc/src/unistd/linux/pipe.cpp | 4 +- libc/src/unistd/linux/pipe2.cpp | 2 +- libc/src/unistd/linux/pread.cpp 
| 6 +- libc/src/unistd/linux/pwrite.cpp | 2 +- libc/src/unistd/linux/read.cpp | 4 +- libc/src/unistd/linux/readlink.cpp | 2 +- libc/src/unistd/linux/readlinkat.cpp | 2 +- libc/src/unistd/linux/rmdir.cpp | 2 +- libc/src/unistd/linux/symlink.cpp | 2 +- libc/src/unistd/linux/symlinkat.cpp | 2 +- libc/src/unistd/linux/syscall.cpp | 2 +- libc/src/unistd/linux/sysconf.cpp | 2 +- libc/src/unistd/linux/truncate.cpp | 2 +- libc/src/unistd/linux/unlink.cpp | 2 +- libc/src/unistd/linux/unlinkat.cpp | 2 +- libc/src/unistd/linux/write.cpp | 2 +- libc/src/unistd/windows/getentropy.cpp | 2 +- libc/test/IntegrationTest/test.h | 9 +- libc/test/UnitTest/ErrnoCheckingTest.h | 4 +- libc/test/UnitTest/ErrnoSetterMatcher.h | 6 +- libc/test/UnitTest/FPMatcher.h | 8 +- libc/test/UnitTest/Test.h | 11 +- .../src/pthread/pthread_create_test.cpp | 4 +- .../src/pthread/pthread_join_test.cpp | 4 +- .../src/pthread/pthread_name_test.cpp | 2 +- .../integration/src/unistd/getcwd_test.cpp | 6 +- .../integration/startup/linux/tls_test.cpp | 2 +- libc/test/src/__support/str_to_fp_test.h | 1 + .../src/__support/str_to_integer_test.cpp | 1 + libc/test/src/dirent/dirent_test.cpp | 10 +- libc/test/src/errno/errno_test.cpp | 4 +- libc/test/src/fcntl/creat_test.cpp | 2 +- libc/test/src/fcntl/fcntl_test.cpp | 4 +- libc/test/src/fcntl/openat_test.cpp | 2 +- libc/test/src/math/RoundToIntegerTest.h | 2 +- libc/test/src/math/acosf_test.cpp | 4 +- libc/test/src/math/acoshf16_test.cpp | 2 +- libc/test/src/math/acoshf_test.cpp | 4 +- libc/test/src/math/asin_test.cpp | 2 +- libc/test/src/math/asinf_test.cpp | 4 +- libc/test/src/math/asinhf_test.cpp | 4 +- libc/test/src/math/atan2f_test.cpp | 2 +- libc/test/src/math/atan_test.cpp | 2 +- libc/test/src/math/atanf_test.cpp | 4 +- libc/test/src/math/atanhf_test.cpp | 4 +- libc/test/src/math/cosf_test.cpp | 4 +- libc/test/src/math/coshf_test.cpp | 6 +- libc/test/src/math/cospif_test.cpp | 4 +- libc/test/src/math/exp10_test.cpp | 4 +- libc/test/src/math/exp10f_test.cpp | 
15 ++- libc/test/src/math/exp10m1f_test.cpp | 8 +- libc/test/src/math/exp2_test.cpp | 4 +- libc/test/src/math/exp2f_test.cpp | 15 ++- libc/test/src/math/exp2m1f_test.cpp | 9 +- libc/test/src/math/exp_test.cpp | 4 +- libc/test/src/math/expf_test.cpp | 15 ++- libc/test/src/math/expm1_test.cpp | 4 +- libc/test/src/math/expm1f_test.cpp | 15 ++- libc/test/src/math/log10_test.cpp | 4 +- libc/test/src/math/log1p_test.cpp | 4 +- libc/test/src/math/log1pf_test.cpp | 4 +- libc/test/src/math/log2_test.cpp | 4 +- libc/test/src/math/log2f_test.cpp | 7 +- libc/test/src/math/log_test.cpp | 4 +- libc/test/src/math/powf_test.cpp | 2 +- libc/test/src/math/sin_test.cpp | 2 +- libc/test/src/math/sincosf_test.cpp | 4 +- libc/test/src/math/sinf_test.cpp | 4 +- libc/test/src/math/sinhf_test.cpp | 6 +- libc/test/src/math/sinpif_test.cpp | 4 +- libc/test/src/math/smoke/FModTest.h | 2 +- libc/test/src/math/smoke/RoundToIntegerTest.h | 2 +- libc/test/src/math/smoke/acos_test.cpp | 4 +- libc/test/src/math/smoke/acosf16_test.cpp | 4 +- libc/test/src/math/smoke/acosf_test.cpp | 4 +- libc/test/src/math/smoke/acoshf16_test.cpp | 4 +- libc/test/src/math/smoke/acoshf_test.cpp | 4 +- libc/test/src/math/smoke/acospif16_test.cpp | 4 +- libc/test/src/math/smoke/asinf16_test.cpp | 4 +- libc/test/src/math/smoke/asinf_test.cpp | 4 +- libc/test/src/math/smoke/asinhf16_test.cpp | 4 +- libc/test/src/math/smoke/asinhf_test.cpp | 4 +- libc/test/src/math/smoke/atan2f_test.cpp | 4 +- libc/test/src/math/smoke/atanf16_test.cpp | 4 +- libc/test/src/math/smoke/atanf_test.cpp | 4 +- libc/test/src/math/smoke/atanhf16_test.cpp | 4 +- libc/test/src/math/smoke/atanhf_test.cpp | 4 +- libc/test/src/math/smoke/cosf16_test.cpp | 4 +- libc/test/src/math/smoke/cosf_test.cpp | 4 +- libc/test/src/math/smoke/coshf16_test.cpp | 6 +- libc/test/src/math/smoke/coshf_test.cpp | 6 +- libc/test/src/math/smoke/cospif16_test.cpp | 4 +- libc/test/src/math/smoke/cospif_test.cpp | 4 +- libc/test/src/math/smoke/exp10_test.cpp | 2 +- 
libc/test/src/math/smoke/exp10f16_test.cpp | 8 +- libc/test/src/math/smoke/exp10f_test.cpp | 6 +- libc/test/src/math/smoke/exp10m1f16_test.cpp | 8 +- libc/test/src/math/smoke/exp10m1f_test.cpp | 8 +- libc/test/src/math/smoke/exp2_test.cpp | 2 +- libc/test/src/math/smoke/exp2f16_test.cpp | 8 +- libc/test/src/math/smoke/exp2f_test.cpp | 6 +- libc/test/src/math/smoke/exp2m1f16_test.cpp | 8 +- libc/test/src/math/smoke/exp2m1f_test.cpp | 8 +- libc/test/src/math/smoke/exp_test.cpp | 2 +- libc/test/src/math/smoke/expf16_test.cpp | 8 +- libc/test/src/math/smoke/expf_test.cpp | 6 +- libc/test/src/math/smoke/expm1_test.cpp | 2 +- libc/test/src/math/smoke/expm1f16_test.cpp | 8 +- libc/test/src/math/smoke/expm1f_test.cpp | 6 +- libc/test/src/math/smoke/log10_test.cpp | 2 +- libc/test/src/math/smoke/log10f16_test.cpp | 4 +- libc/test/src/math/smoke/log1p_test.cpp | 2 +- libc/test/src/math/smoke/log1pf_test.cpp | 2 +- libc/test/src/math/smoke/log2_test.cpp | 2 +- libc/test/src/math/smoke/log2f16_test.cpp | 4 +- libc/test/src/math/smoke/log2f_test.cpp | 2 +- libc/test/src/math/smoke/log_test.cpp | 2 +- libc/test/src/math/smoke/logf16_test.cpp | 4 +- libc/test/src/math/smoke/sincosf_test.cpp | 4 +- libc/test/src/math/smoke/sinf16_test.cpp | 4 +- libc/test/src/math/smoke/sinf_test.cpp | 4 +- libc/test/src/math/smoke/sinhf16_test.cpp | 6 +- libc/test/src/math/smoke/sinhf_test.cpp | 6 +- libc/test/src/math/smoke/sinpif16_test.cpp | 4 +- libc/test/src/math/smoke/sinpif_test.cpp | 4 +- libc/test/src/math/smoke/tanf16_test.cpp | 4 +- libc/test/src/math/smoke/tanf_test.cpp | 4 +- libc/test/src/math/smoke/tanhf16_test.cpp | 6 +- libc/test/src/math/smoke/tanhf_test.cpp | 4 +- libc/test/src/math/smoke/tanpif16_test.cpp | 4 +- libc/test/src/math/tanf_test.cpp | 4 +- libc/test/src/math/tanhf_test.cpp | 4 +- libc/test/src/poll/poll_test.cpp | 6 +- libc/test/src/sched/affinity_test.cpp | 10 +- libc/test/src/sched/cpu_count_test.cpp | 4 +- libc/test/src/sched/get_priority_test.cpp | 4 +- 
.../src/sched/param_and_scheduler_test.cpp | 49 ++++---- .../src/sched/sched_rr_get_interval_test.cpp | 10 +- libc/test/src/sched/yield_test.cpp | 4 +- libc/test/src/signal/sigaltstack_test.cpp | 4 +- libc/test/src/signal/signal_test.cpp | 4 +- libc/test/src/signal/sigprocmask_test.cpp | 4 +- .../spawn/posix_spawn_file_actions_test.cpp | 2 +- libc/test/src/stdio/fdopen_test.cpp | 10 +- libc/test/src/stdio/fgetc_test.cpp | 4 +- libc/test/src/stdio/fgetc_unlocked_test.cpp | 4 +- libc/test/src/stdio/fgets_test.cpp | 4 +- libc/test/src/stdio/fileop_test.cpp | 24 ++-- libc/test/src/stdio/fopencookie_test.cpp | 10 +- libc/test/src/stdio/remove_test.cpp | 6 +- libc/test/src/stdio/rename_test.cpp | 4 +- libc/test/src/stdio/setvbuf_test.cpp | 4 +- libc/test/src/stdio/sprintf_test.cpp | 76 ++++++------ libc/test/src/stdio/unlocked_fileop_test.cpp | 6 +- libc/test/src/stdlib/StrtolTest.h | 1 + libc/test/src/stdlib/strtoint32_test.cpp | 6 +- libc/test/src/stdlib/strtoint64_test.cpp | 6 +- libc/test/src/stdlib/strtold_test.cpp | 1 + libc/test/src/sys/mman/linux/mlock_test.cpp | 17 ++- .../src/sys/statvfs/linux/fstatvfs_test.cpp | 4 +- .../src/sys/statvfs/linux/statvfs_test.cpp | 4 +- libc/test/src/sys/time/setitimer_test.cpp | 2 +- libc/test/src/termios/termios_test.cpp | 12 +- libc/test/src/time/asctime_r_test.cpp | 2 +- libc/test/src/time/asctime_test.cpp | 2 +- libc/test/src/time/ctime_r_test.cpp | 2 +- libc/test/src/time/ctime_test.cpp | 2 +- libc/test/src/time/gmtime_test.cpp | 4 +- libc/test/src/time/nanosleep_test.cpp | 4 +- .../llvm-project-overlay/libc/BUILD.bazel | 3 +- 397 files changed, 829 insertions(+), 783 deletions(-) create mode 100644 libc/shared/libc_common.h create mode 100644 libc/src/__support/libc_errno.h delete mode 100644 libc/src/errno/libc_errno.h diff --git a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake index 0facb0b9be0c1..a98e7276bef80 100644 --- 
a/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake +++ b/libc/cmake/modules/LLVMLibCCompileOptionRules.cmake @@ -106,6 +106,10 @@ function(_get_compile_options_from_config output_var) list(APPEND config_options "-DLIBC_MATH=${LIBC_CONF_MATH_OPTIMIZATIONS}") endif() + if(LIBC_CONF_ERRNO_MODE) + list(APPEND config_options "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}") + endif() + set(${output_var} ${config_options} PARENT_SCOPE) endfunction(_get_compile_options_from_config) diff --git a/libc/config/config.json b/libc/config/config.json index bfe956855cb52..d53b2936edb07 100644 --- a/libc/config/config.json +++ b/libc/config/config.json @@ -2,7 +2,7 @@ "errno": { "LIBC_CONF_ERRNO_MODE": { "value": "LIBC_ERRNO_MODE_DEFAULT", - "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM." + "doc": "The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE." } }, "printf": { diff --git a/libc/docs/dev/code_style.rst b/libc/docs/dev/code_style.rst index 0bd3a69ae3ffe..86247966552f9 100644 --- a/libc/docs/dev/code_style.rst +++ b/libc/docs/dev/code_style.rst @@ -101,7 +101,7 @@ test infrastructure itself can be affected. To avoid perturbing the unit test infrastructure around the setting of ``errno``, the following rules are to be followed: -#. A special macro named ``libc_errno`` defined in ``src/errno/libc_errno.h`` +#. A special macro named ``libc_errno`` defined in ``src/__support/libc_errno.h`` should be used when setting ``errno`` from libc runtime code. For example, code to set ``errno`` to ``EINVAL`` should be: @@ -117,7 +117,7 @@ followed: `ErrorOr `_ to return error values. -#.
The header file ``src/errno/libc_errno.h`` is shipped as part of the target +#. The header file ``src/__support/libc_errno.h`` is shipped as part of the target corresponding to the ``errno`` entrypoint ``libc.src.errno.errno``. We do not in general allow dependencies between entrypoints. However, the ``errno`` entrypoint is the only exceptional entrypoint on which other entrypoints diff --git a/libc/shared/fp_bits.h b/libc/shared/fp_bits.h index 2898c508b7772..e6bb1e17b80c9 100644 --- a/libc/shared/fp_bits.h +++ b/libc/shared/fp_bits.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SHARED_FP_BITS_H #define LLVM_LIBC_SHARED_FP_BITS_H +#include "libc_common.h" #include "src/__support/FPUtil/FPBits.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/shared/libc_common.h b/libc/shared/libc_common.h new file mode 100644 index 0000000000000..c4560bbb02763 --- /dev/null +++ b/libc/shared/libc_common.h @@ -0,0 +1,26 @@ +//===-- Common defines for sharing LLVM libc with LLVM projects -*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SHARED_LIBC_COMMON_H +#define LLVM_LIBC_SHARED_LIBC_COMMON_H + +// Use system errno. +#ifdef LIBC_ERRNO_MODE +#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE +#error \ + "LIBC_ERRNO_MODE was set to something different from LIBC_ERRNO_MODE_SYSTEM_INLINE." 
+#endif // LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE +#else +#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM_INLINE +#endif // LIBC_ERRNO_MODE + +#ifndef LIBC_NAMESPACE +#define LIBC_NAMESPACE __llvm_libc +#endif // LIBC_NAMESPACE + +#endif // LLVM_LIBC_SHARED_LIBC_COMMON_H diff --git a/libc/shared/rpc_server.h b/libc/shared/rpc_server.h index 5509094b944ad..46e35f13f0eac 100644 --- a/libc/shared/rpc_server.h +++ b/libc/shared/rpc_server.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SHARED_RPC_SERVER_H #define LLVM_LIBC_SHARED_RPC_SERVER_H +#include "libc_common.h" #include "src/__support/RPC/rpc_server.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/shared/str_to_float.h b/libc/shared/str_to_float.h index b133a28e26efc..dcc6027d6c77f 100644 --- a/libc/shared/str_to_float.h +++ b/libc/shared/str_to_float.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SHARED_STR_TO_FLOAT_H #define LLVM_LIBC_SHARED_STR_TO_FLOAT_H +#include "libc_common.h" #include "src/__support/str_to_float.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/shared/str_to_integer.h b/libc/shared/str_to_integer.h index 15bee698d5a6b..6ed38c932662e 100644 --- a/libc/shared/str_to_integer.h +++ b/libc/shared/str_to_integer.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIBC_SHARED_STR_TO_INTEGER_H #define LLVM_LIBC_SHARED_STR_TO_INTEGER_H +#include "libc_common.h" #include "src/__support/str_to_integer.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/CMakeLists.txt b/libc/src/__support/CMakeLists.txt index f92499fdbf451..327ff5e0c6a37 100644 --- a/libc/src/__support/CMakeLists.txt +++ b/libc/src/__support/CMakeLists.txt @@ -1,6 +1,15 @@ add_subdirectory(CPP) add_subdirectory(macros) +add_header_library( + libc_errno + HDRS + libc_errno.h + DEPENDS + libc.hdr.errno_macros + libc.src.__support.macros.config +) + add_header_library( block HDRS diff --git a/libc/src/__support/FPUtil/FEnvImpl.h b/libc/src/__support/FPUtil/FEnvImpl.h index 4c8f34a435bdf..50a101f833c55 100644 --- 
a/libc/src/__support/FPUtil/FEnvImpl.h +++ b/libc/src/__support/FPUtil/FEnvImpl.h @@ -12,10 +12,10 @@ #include "hdr/fenv_macros.h" #include "hdr/math_macros.h" #include "hdr/types/fenv_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" #include "src/__support/macros/properties/architectures.h" -#include "src/errno/libc_errno.h" #if defined(LIBC_TARGET_ARCH_IS_AARCH64) && defined(__ARM_FP) #if defined(__APPLE__) diff --git a/libc/src/__support/File/dir.cpp b/libc/src/__support/File/dir.cpp index 21b0106f70106..aea8862c15f7f 100644 --- a/libc/src/__support/File/dir.cpp +++ b/libc/src/__support/File/dir.cpp @@ -11,8 +11,8 @@ #include "src/__support/CPP/mutex.h" // lock_guard #include "src/__support/CPP/new.h" #include "src/__support/error_or.h" +#include "src/__support/libc_errno.h" // For error macros #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" // For error macros namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/File/file.cpp b/libc/src/__support/File/file.cpp index 528542cccf324..303852dbbb717 100644 --- a/libc/src/__support/File/file.cpp +++ b/libc/src/__support/File/file.cpp @@ -13,8 +13,8 @@ #include "hdr/types/off_t.h" #include "src/__support/CPP/new.h" #include "src/__support/CPP/span.h" +#include "src/__support/libc_errno.h" // For error macros #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" // For error macros namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/File/linux/file.cpp b/libc/src/__support/File/linux/file.cpp index 824c1f200e8c5..761e352f74ead 100644 --- a/libc/src/__support/File/linux/file.cpp +++ b/libc/src/__support/File/linux/file.cpp @@ -15,8 +15,8 @@ #include "src/__support/File/linux/lseekImpl.h" #include "src/__support/OSUtil/fcntl.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
+#include "src/__support/libc_errno.h" // For error macros #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" // For error macros #include "hdr/fcntl_macros.h" // For mode_t and other flags to the open syscall #include // For S_IS*, S_IF*, and S_IR* flags. diff --git a/libc/src/__support/File/linux/lseekImpl.h b/libc/src/__support/File/linux/lseekImpl.h index a034913d9f6ec..300e5c5dd55bf 100644 --- a/libc/src/__support/File/linux/lseekImpl.h +++ b/libc/src/__support/File/linux/lseekImpl.h @@ -13,8 +13,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" #include "src/__support/error_or.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For uint64_t. #include // For syscall numbers. diff --git a/libc/src/__support/HashTable/randomness.h b/libc/src/__support/HashTable/randomness.h index 244dd41be3eec..6b58a4125f785 100644 --- a/libc/src/__support/HashTable/randomness.h +++ b/libc/src/__support/HashTable/randomness.h @@ -14,7 +14,7 @@ #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" #if defined(LIBC_HASHTABLE_USE_GETRANDOM) -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sys/random/getrandom.h" #endif diff --git a/libc/src/__support/OSUtil/linux/fcntl.cpp b/libc/src/__support/OSUtil/linux/fcntl.cpp index 4742b2a00220b..99e16ad58c918 100644 --- a/libc/src/__support/OSUtil/linux/fcntl.cpp +++ b/libc/src/__support/OSUtil/linux/fcntl.cpp @@ -15,8 +15,8 @@ #include "hdr/types/struct_flock64.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. 
diff --git a/libc/src/__support/OSUtil/linux/vdso.cpp b/libc/src/__support/OSUtil/linux/vdso.cpp index 8c9bd3e1bcc72..e4e53c3c2a0f2 100644 --- a/libc/src/__support/OSUtil/linux/vdso.cpp +++ b/libc/src/__support/OSUtil/linux/vdso.cpp @@ -11,9 +11,9 @@ #include "src/__support/CPP/array.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/string_view.h" +#include "src/__support/libc_errno.h" #include "src/__support/threads/callonce.h" #include "src/__support/threads/linux/futex_word.h" -#include "src/errno/libc_errno.h" #include "src/sys/auxv/getauxval.h" #include diff --git a/libc/src/__support/StringUtil/tables/linux_extension_errors.h b/libc/src/__support/StringUtil/tables/linux_extension_errors.h index 425590f6e91c9..de637d60bea97 100644 --- a/libc/src/__support/StringUtil/tables/linux_extension_errors.h +++ b/libc/src/__support/StringUtil/tables/linux_extension_errors.h @@ -10,8 +10,8 @@ #define LLVM_LIBC_SRC___SUPPORT_STRINGUTIL_TABLES_LINUX_EXTENSION_ERRORS_H #include "src/__support/StringUtil/message_mapper.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/__support/libc_errno.h b/libc/src/__support/libc_errno.h new file mode 100644 index 0000000000000..ab5f6a9c4b9d9 --- /dev/null +++ b/libc/src/__support/libc_errno.h @@ -0,0 +1,108 @@ +//===-- Implementation header for libc_errno --------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H +#define LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H + +// This header is to be consumed by internal implementations, in which all of +// them should refer to `libc_errno` instead of using `errno` directly from +// <errno.h> header. + +// Unit and hermetic tests should: +// - #include "src/__support/libc_errno.h" +// - NOT #include <errno.h> +// - Only use `libc_errno` in the code +// - Depend on libc.src.errno.errno + +// Integration tests should: +// - NOT #include "src/__support/libc_errno.h" +// - #include <errno.h> +// - Use regular `errno` in the code +// - Still depend on libc.src.errno.errno + +// libc uses a fallback default value, either system or thread local. +#define LIBC_ERRNO_MODE_DEFAULT 0 +// libc never stores a value; `errno` macro uses get link-time failure. +#define LIBC_ERRNO_MODE_UNDEFINED 1 +// libc maintains per-thread state (requires C++ `thread_local` support). +#define LIBC_ERRNO_MODE_THREAD_LOCAL 2 +// libc maintains shared state used by all threads, contrary to standard C +// semantics unless always single-threaded; nothing prevents data races. +#define LIBC_ERRNO_MODE_SHARED 3 +// libc doesn't maintain any internal state, instead the embedder must define +// `int *__llvm_libc_errno(void);` C function. +#define LIBC_ERRNO_MODE_EXTERNAL 4 +// libc uses system `<errno.h>` `errno` macro directly in the overlay mode; in +// fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`. +// In this mode, the public C++ symbol `LIBC_NAMESPACE::libc_errno` is still +// exported and gets redirected to the system `errno` inside its implementation. + +// TODO: Investigate deprecating LIBC_ERRNO_MODE_SYSTEM in favor of +// LIBC_ERRNO_MODE_SYSTEM_INLINE.
+// https://github.com/llvm/llvm-project/issues/143454 +#define LIBC_ERRNO_MODE_SYSTEM 5 +// In this mode, `libc_errno` is simply a macro resolved to `errno` from the +// system header <errno.h>. There is no need to link against the +// `libc.src.errno.errno` object. +#define LIBC_ERRNO_MODE_SYSTEM_INLINE 6 + +#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT +#undef LIBC_ERRNO_MODE +#if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING) +#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL +#else +#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM +#endif +#endif // LIBC_ERRNO_MODE + +#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM && \ + LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE +#error LIBC_ERRNO_MODE must be one of the following values: \ +LIBC_ERRNO_MODE_DEFAULT, \ +LIBC_ERRNO_MODE_UNDEFINED, \ +LIBC_ERRNO_MODE_THREAD_LOCAL, \ +LIBC_ERRNO_MODE_SHARED, \ +LIBC_ERRNO_MODE_EXTERNAL, \ +LIBC_ERRNO_MODE_SYSTEM, \ +LIBC_ERRNO_MODE_SYSTEM_INLINE.
+#endif + +#if LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_SYSTEM_INLINE + +#include <errno.h> + +#define libc_errno errno + +#else // !LIBC_ERRNO_MODE_SYSTEM_INLINE + +#include "hdr/errno_macros.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +extern "C" int *__llvm_libc_errno() noexcept; + +struct Errno { + void operator=(int); + operator int(); +}; + +extern Errno libc_errno; + +} // namespace LIBC_NAMESPACE_DECL + +using LIBC_NAMESPACE::libc_errno; + +#endif // LIBC_ERRNO_MODE_SYSTEM_INLINE + +#endif // LLVM_LIBC_SRC___SUPPORT_LIBC_ERRNO_H diff --git a/libc/src/__support/threads/linux/thread.cpp b/libc/src/__support/threads/linux/thread.cpp index c531d74c53355..baad26aed6851 100644 --- a/libc/src/__support/threads/linux/thread.cpp +++ b/libc/src/__support/threads/linux/thread.cpp @@ -14,9 +14,9 @@ #include "src/__support/OSUtil/syscall.h" // For syscall functions. #include "src/__support/common.h" #include "src/__support/error_or.h" +#include "src/__support/libc_errno.h" // For error macros #include "src/__support/macros/config.h" #include "src/__support/threads/linux/futex_utils.h" // For FutexWordType -#include "src/errno/libc_errno.h" // For error macros #ifdef LIBC_TARGET_ARCH_IS_AARCH64 #include diff --git a/libc/src/dirent/closedir.cpp b/libc/src/dirent/closedir.cpp index 1249ef94cf411..2f8f6f0c044db 100644 --- a/libc/src/dirent/closedir.cpp +++ b/libc/src/dirent/closedir.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/dir.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/dirent/opendir.cpp b/libc/src/dirent/opendir.cpp index fee14ef0f558d..bf47d0edac180 100644 --- a/libc/src/dirent/opendir.cpp +++ b/libc/src/dirent/opendir.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/dir.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include
"src/errno/libc_errno.h" #include diff --git a/libc/src/dirent/readdir.cpp b/libc/src/dirent/readdir.cpp index ad460b5e80b8b..f95f7c1ae8646 100644 --- a/libc/src/dirent/readdir.cpp +++ b/libc/src/dirent/readdir.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/dir.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/errno/CMakeLists.txt b/libc/src/errno/CMakeLists.txt index 1d78a5eedff96..2852044e94164 100644 --- a/libc/src/errno/CMakeLists.txt +++ b/libc/src/errno/CMakeLists.txt @@ -1,28 +1,16 @@ # If we are in full build mode, we will provide the errno definition ourselves, # and if we are in overlay mode, we will just re-use the system's errno. -# We are passing LIBC_FULL_BUILD flag in full build mode so that the -# implementation of libc_errno will know if we are in full build mode or not. - -# TODO: Move LIBC_FULL_BUILD flag to _get_common_compile_options. 
-set(full_build_flag "") -if(LLVM_LIBC_FULL_BUILD) - set(full_build_flag "-DLIBC_FULL_BUILD") -endif() - -if(LIBC_CONF_ERRNO_MODE) - set(errno_config_copts "-DLIBC_ERRNO_MODE=${LIBC_CONF_ERRNO_MODE}") -endif() add_entrypoint_object( errno SRCS libc_errno.cpp HDRS - libc_errno.h # Include this - COMPILE_OPTIONS - ${full_build_flag} - ${errno_config_copts} + ../__support/libc_errno.h DEPENDS libc.hdr.errno_macros libc.src.__support.common + libc.src.__support.libc_errno + libc.src.__support.macros.attributes + libc.src.__support.macros.config ) diff --git a/libc/src/errno/libc_errno.cpp b/libc/src/errno/libc_errno.cpp index d1600d1b050e3..8ff1eec1b1035 100644 --- a/libc/src/errno/libc_errno.cpp +++ b/libc/src/errno/libc_errno.cpp @@ -6,51 +6,14 @@ // //===----------------------------------------------------------------------===// -#include "libc_errno.h" +#include "src/__support/libc_errno.h" +#include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" -// libc uses a fallback default value, either system or thread local. -#define LIBC_ERRNO_MODE_DEFAULT 0 -// libc never stores a value; `errno` macro uses get link-time failure. -#define LIBC_ERRNO_MODE_UNDEFINED 1 -// libc maintains per-thread state (requires C++ `thread_local` support). -#define LIBC_ERRNO_MODE_THREAD_LOCAL 2 -// libc maintains shared state used by all threads, contrary to standard C -// semantics unless always single-threaded; nothing prevents data races. -#define LIBC_ERRNO_MODE_SHARED 3 -// libc doesn't maintain any internal state, instead the embedder must define -// `int *__llvm_libc_errno(void);` C function. -#define LIBC_ERRNO_MODE_EXTERNAL 4 -// libc uses system `` `errno` macro directly in the overlay mode; in -// fullbuild mode, effectively the same as `LIBC_ERRNO_MODE_EXTERNAL`. 
-#define LIBC_ERRNO_MODE_SYSTEM 5 - -#if !defined(LIBC_ERRNO_MODE) || LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_DEFAULT -#undef LIBC_ERRNO_MODE -#if defined(LIBC_FULL_BUILD) || !defined(LIBC_COPT_PUBLIC_PACKAGING) -#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_THREAD_LOCAL -#else -#define LIBC_ERRNO_MODE LIBC_ERRNO_MODE_SYSTEM -#endif -#endif // LIBC_ERRNO_MODE - -#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_DEFAULT && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_UNDEFINED && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_THREAD_LOCAL && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SHARED && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_EXTERNAL && \ - LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM -#error LIBC_ERRNO_MODE must be one of the following values: \ -LIBC_ERRNO_MODE_DEFAULT, \ -LIBC_ERRNO_MODE_UNDEFINED, \ -LIBC_ERRNO_MODE_THREAD_LOCAL, \ -LIBC_ERRNO_MODE_SHARED, \ -LIBC_ERRNO_MODE_EXTERNAL, \ -LIBC_ERRNO_MODE_SYSTEM -#endif - namespace LIBC_NAMESPACE_DECL { +#if LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE + #if LIBC_ERRNO_MODE == LIBC_ERRNO_MODE_UNDEFINED void Errno::operator=(int) {} @@ -93,4 +56,6 @@ Errno::operator int() { return errno; } // Define the global `libc_errno` instance. Errno libc_errno; +#endif // LIBC_ERRNO_MODE != LIBC_ERRNO_MODE_SYSTEM_INLINE + } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/errno/libc_errno.h b/libc/src/errno/libc_errno.h deleted file mode 100644 index 44ee2714843ba..0000000000000 --- a/libc/src/errno/libc_errno.h +++ /dev/null @@ -1,47 +0,0 @@ -//===-- Implementation header for libc_errno --------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H -#define LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H - -#include "src/__support/macros/attributes.h" -#include "src/__support/macros/config.h" -#include "src/__support/macros/properties/architectures.h" - -#include "hdr/errno_macros.h" - -// This header is to be consumed by internal implementations, in which all of -// them should refer to `libc_errno` instead of using `errno` directly from -// header. - -// Unit and hermetic tests should: -// - #include "src/errno/libc_errno.h" -// - NOT #include -// - Only use `libc_errno` in the code -// - Depend on libc.src.errno.errno - -// Integration tests should: -// - NOT #include "src/errno/libc_errno.h" -// - #include -// - Use regular `errno` in the code -// - Still depend on libc.src.errno.errno - -namespace LIBC_NAMESPACE_DECL { - -extern "C" int *__llvm_libc_errno() noexcept; - -struct Errno { - void operator=(int); - operator int(); -}; - -extern Errno libc_errno; - -} // namespace LIBC_NAMESPACE_DECL - -#endif // LLVM_LIBC_SRC_ERRNO_LIBC_ERRNO_H diff --git a/libc/src/fcntl/linux/creat.cpp b/libc/src/fcntl/linux/creat.cpp index 23abae243aed9..71412a8e68c53 100644 --- a/libc/src/fcntl/linux/creat.cpp +++ b/libc/src/fcntl/linux/creat.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/fcntl/linux/open.cpp b/libc/src/fcntl/linux/open.cpp index 8b699ecdd2043..a21a03788deaa 100644 --- a/libc/src/fcntl/linux/open.cpp +++ b/libc/src/fcntl/linux/open.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include "hdr/types/mode_t.h" diff --git a/libc/src/fcntl/linux/openat.cpp b/libc/src/fcntl/linux/openat.cpp index 6063d9c00ad6c..b47ad1fb3bb0f 100644 --- a/libc/src/fcntl/linux/openat.cpp +++ b/libc/src/fcntl/linux/openat.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/types/mode_t.h" #include diff --git a/libc/src/inttypes/strtoimax.cpp b/libc/src/inttypes/strtoimax.cpp index 85f197c75d90c..6e55a4b56aac7 100644 --- a/libc/src/inttypes/strtoimax.cpp +++ b/libc/src/inttypes/strtoimax.cpp @@ -8,9 +8,9 @@ #include "src/inttypes/strtoimax.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/inttypes/strtoumax.cpp b/libc/src/inttypes/strtoumax.cpp index 2e9cbc9acba75..ce5a0a782d979 100644 --- a/libc/src/inttypes/strtoumax.cpp +++ b/libc/src/inttypes/strtoumax.cpp @@ -8,9 +8,9 @@ #include "src/inttypes/strtoumax.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/exp10m1f.cpp b/libc/src/math/generic/exp10m1f.cpp index e973b2921c2e4..27729104e038d 100644 --- a/libc/src/math/generic/exp10m1f.cpp +++ b/libc/src/math/generic/exp10m1f.cpp @@ -14,9 +14,9 @@ #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/common.h" +#include 
"src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" -#include "src/errno/libc_errno.h" #include "explogxf.h" diff --git a/libc/src/math/generic/exp2m1f.cpp b/libc/src/math/generic/exp2m1f.cpp index 4913a5e4277e4..127c6eaa494d4 100644 --- a/libc/src/math/generic/exp2m1f.cpp +++ b/libc/src/math/generic/exp2m1f.cpp @@ -14,10 +14,10 @@ #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/rounding_mode.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/macros/properties/cpu_features.h" -#include "src/errno/libc_errno.h" #include "explogxf.h" diff --git a/libc/src/math/generic/nan.cpp b/libc/src/math/generic/nan.cpp index f92cd3ff5eb50..829a2ea435ac0 100644 --- a/libc/src/math/generic/nan.cpp +++ b/libc/src/math/generic/nan.cpp @@ -8,9 +8,9 @@ #include "src/math/nan.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/nanf.cpp b/libc/src/math/generic/nanf.cpp index 7287182406acd..1cb66160e736e 100644 --- a/libc/src/math/generic/nanf.cpp +++ b/libc/src/math/generic/nanf.cpp @@ -8,9 +8,9 @@ #include "src/math/nanf.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/nanf128.cpp b/libc/src/math/generic/nanf128.cpp index 3d8581afa0371..4155c5333a9c2 100644 --- a/libc/src/math/generic/nanf128.cpp +++ b/libc/src/math/generic/nanf128.cpp @@ -8,9 +8,9 @@ #include "src/math/nanf128.h" #include "src/__support/common.h" +#include 
"src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/nanf16.cpp b/libc/src/math/generic/nanf16.cpp index 27d9d165f4a85..7b166400601bc 100644 --- a/libc/src/math/generic/nanf16.cpp +++ b/libc/src/math/generic/nanf16.cpp @@ -8,9 +8,9 @@ #include "src/math/nanf16.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/math/generic/nanl.cpp b/libc/src/math/generic/nanl.cpp index 4f698cb3c88d0..58d638c4b531d 100644 --- a/libc/src/math/generic/nanl.cpp +++ b/libc/src/math/generic/nanl.cpp @@ -8,9 +8,9 @@ #include "src/math/nanl.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/poll/linux/poll.cpp b/libc/src/poll/linux/poll.cpp index f82fcbcc6577c..4cac75b9687c8 100644 --- a/libc/src/poll/linux/poll.cpp +++ b/libc/src/poll/linux/poll.cpp @@ -13,8 +13,8 @@ #include "hdr/types/struct_timespec.h" #include "src/__support/OSUtil/syscall.h" // syscall_impl #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // SYS_poll, SYS_ppoll diff --git a/libc/src/pthread/pthread_atfork.cpp b/libc/src/pthread/pthread_atfork.cpp index b2c67c78e5d94..4cad16a02de70 100644 --- a/libc/src/pthread/pthread_atfork.cpp +++ b/libc/src/pthread/pthread_atfork.cpp @@ -9,9 +9,9 @@ #include "pthread_atfork.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include 
"src/__support/threads/fork_callbacks.h" -#include "src/errno/libc_errno.h" #include // For pthread_* type definitions. diff --git a/libc/src/pthread/pthread_attr_setdetachstate.cpp b/libc/src/pthread/pthread_attr_setdetachstate.cpp index 872f694e01f3a..c482d25610c28 100644 --- a/libc/src/pthread/pthread_attr_setdetachstate.cpp +++ b/libc/src/pthread/pthread_attr_setdetachstate.cpp @@ -9,8 +9,8 @@ #include "pthread_attr_setdetachstate.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_attr_setguardsize.cpp b/libc/src/pthread/pthread_attr_setguardsize.cpp index fa4375e915ab4..c996210a61d8a 100644 --- a/libc/src/pthread/pthread_attr_setguardsize.cpp +++ b/libc/src/pthread/pthread_attr_setguardsize.cpp @@ -9,8 +9,8 @@ #include "pthread_attr_setguardsize.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For EXEC_PAGESIZE. 
#include diff --git a/libc/src/pthread/pthread_attr_setstack.cpp b/libc/src/pthread/pthread_attr_setstack.cpp index 1154055a63a7e..767f959b14003 100644 --- a/libc/src/pthread/pthread_attr_setstack.cpp +++ b/libc/src/pthread/pthread_attr_setstack.cpp @@ -10,9 +10,9 @@ #include "pthread_attr_setstacksize.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" // For STACK_ALIGNMENT -#include "src/errno/libc_errno.h" #include #include diff --git a/libc/src/pthread/pthread_attr_setstacksize.cpp b/libc/src/pthread/pthread_attr_setstacksize.cpp index 0a5d1af661abf..38c77ca761d69 100644 --- a/libc/src/pthread/pthread_attr_setstacksize.cpp +++ b/libc/src/pthread/pthread_attr_setstacksize.cpp @@ -9,8 +9,8 @@ #include "pthread_attr_setstacksize.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_condattr_setclock.cpp b/libc/src/pthread/pthread_condattr_setclock.cpp index 5e825d5ecea69..2f63d5e9d1942 100644 --- a/libc/src/pthread/pthread_condattr_setclock.cpp +++ b/libc/src/pthread/pthread_condattr_setclock.cpp @@ -9,8 +9,8 @@ #include "pthread_condattr_setclock.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/time_macros.h" // CLOCK_MONOTONIC, CLOCK_REALTIME #include // pthread_condattr_t diff --git a/libc/src/pthread/pthread_condattr_setpshared.cpp b/libc/src/pthread/pthread_condattr_setpshared.cpp index 433b2dc1d2d93..9c117499a5592 100644 --- a/libc/src/pthread/pthread_condattr_setpshared.cpp +++ b/libc/src/pthread/pthread_condattr_setpshared.cpp @@ -9,8 +9,8 @@ #include "pthread_condattr_setpshared.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include 
"src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // pthread_condattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE diff --git a/libc/src/pthread/pthread_create.cpp b/libc/src/pthread/pthread_create.cpp index e1b1f3b325d1c..45be2807fa832 100644 --- a/libc/src/pthread/pthread_create.cpp +++ b/libc/src/pthread/pthread_create.cpp @@ -16,10 +16,10 @@ #include "pthread_attr_getstack.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include // For pthread_* type definitions. diff --git a/libc/src/pthread/pthread_key_create.cpp b/libc/src/pthread/pthread_key_create.cpp index 383762f273e7a..7253de14cc0d5 100644 --- a/libc/src/pthread/pthread_key_create.cpp +++ b/libc/src/pthread/pthread_key_create.cpp @@ -9,9 +9,9 @@ #include "pthread_key_create.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_key_delete.cpp b/libc/src/pthread/pthread_key_delete.cpp index b54db821ab05a..2b14d874fe31c 100644 --- a/libc/src/pthread/pthread_key_delete.cpp +++ b/libc/src/pthread/pthread_key_delete.cpp @@ -9,9 +9,9 @@ #include "pthread_key_delete.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_mutexattr_setpshared.cpp b/libc/src/pthread/pthread_mutexattr_setpshared.cpp index deeae15be2303..a87a08259c4bb 100644 --- a/libc/src/pthread/pthread_mutexattr_setpshared.cpp +++ b/libc/src/pthread/pthread_mutexattr_setpshared.cpp @@ -10,8 +10,8 @@ #include "pthread_mutexattr.h" #include 
"src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_mutexattr_setrobust.cpp b/libc/src/pthread/pthread_mutexattr_setrobust.cpp index 9fd46f4c928d7..fd7a8d7ce1d17 100644 --- a/libc/src/pthread/pthread_mutexattr_setrobust.cpp +++ b/libc/src/pthread/pthread_mutexattr_setrobust.cpp @@ -10,8 +10,8 @@ #include "pthread_mutexattr.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_mutexattr_settype.cpp b/libc/src/pthread/pthread_mutexattr_settype.cpp index c7e78271f9c38..5a65f031045d6 100644 --- a/libc/src/pthread/pthread_mutexattr_settype.cpp +++ b/libc/src/pthread/pthread_mutexattr_settype.cpp @@ -10,8 +10,8 @@ #include "pthread_mutexattr.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_rwlock_timedrdlock.cpp b/libc/src/pthread/pthread_rwlock_timedrdlock.cpp index 112ff5c9cdad3..fcddfed224906 100644 --- a/libc/src/pthread/pthread_rwlock_timedrdlock.cpp +++ b/libc/src/pthread/pthread_rwlock_timedrdlock.cpp @@ -9,11 +9,11 @@ #include "src/pthread/pthread_rwlock_timedrdlock.h" #include "src/__support/common.h" #include "src/__support/libc_assert.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" #include "src/__support/threads/linux/rwlock.h" #include "src/__support/time/linux/abs_timeout.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_rwlock_trywrlock.cpp b/libc/src/pthread/pthread_rwlock_trywrlock.cpp index a63dc893e7169..660c15a87b36c 100644 --- a/libc/src/pthread/pthread_rwlock_trywrlock.cpp +++ 
b/libc/src/pthread/pthread_rwlock_trywrlock.cpp @@ -9,9 +9,9 @@ #include "src/pthread/pthread_rwlock_trywrlock.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/linux/rwlock.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_rwlock_unlock.cpp b/libc/src/pthread/pthread_rwlock_unlock.cpp index e61290179bd62..5496bea929c51 100644 --- a/libc/src/pthread/pthread_rwlock_unlock.cpp +++ b/libc/src/pthread/pthread_rwlock_unlock.cpp @@ -9,9 +9,9 @@ #include "src/pthread/pthread_rwlock_unlock.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/linux/rwlock.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp b/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp index 80d34a35c717a..e6800311b8587 100644 --- a/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp +++ b/libc/src/pthread/pthread_rwlockattr_setkind_np.cpp @@ -9,8 +9,8 @@ #include "pthread_rwlockattr_setkind_np.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // pthread_rwlockattr_t diff --git a/libc/src/pthread/pthread_rwlockattr_setpshared.cpp b/libc/src/pthread/pthread_rwlockattr_setpshared.cpp index 5a7191aefd3d0..4fbd095ac2b46 100644 --- a/libc/src/pthread/pthread_rwlockattr_setpshared.cpp +++ b/libc/src/pthread/pthread_rwlockattr_setpshared.cpp @@ -9,8 +9,8 @@ #include "pthread_rwlockattr_setpshared.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // pthread_rwlockattr_t, PTHREAD_PROCESS_SHARED, PTHREAD_PROCESS_PRIVATE diff --git a/libc/src/pthread/pthread_setspecific.cpp 
b/libc/src/pthread/pthread_setspecific.cpp index 70c29c1670841..b147a66d2fad7 100644 --- a/libc/src/pthread/pthread_setspecific.cpp +++ b/libc/src/pthread/pthread_setspecific.cpp @@ -9,9 +9,9 @@ #include "pthread_setspecific.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/sched/linux/sched_get_priority_max.cpp b/libc/src/sched/linux/sched_get_priority_max.cpp index 77a82c77405f3..fb30b1e319e7b 100644 --- a/libc/src/sched/linux/sched_get_priority_max.cpp +++ b/libc/src/sched/linux/sched_get_priority_max.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_get_priority_min.cpp b/libc/src/sched/linux/sched_get_priority_min.cpp index fca66a15edb55..54f67e915fc17 100644 --- a/libc/src/sched/linux/sched_get_priority_min.cpp +++ b/libc/src/sched/linux/sched_get_priority_min.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_getaffinity.cpp b/libc/src/sched/linux/sched_getaffinity.cpp index 7b1fd8c5aa2af..e005819e2a978 100644 --- a/libc/src/sched/linux/sched_getaffinity.cpp +++ b/libc/src/sched/linux/sched_getaffinity.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include diff --git a/libc/src/sched/linux/sched_getparam.cpp b/libc/src/sched/linux/sched_getparam.cpp index 75756a65f0ede..b0576c3ac65b8 100644 --- a/libc/src/sched/linux/sched_getparam.cpp +++ b/libc/src/sched/linux/sched_getparam.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_getscheduler.cpp b/libc/src/sched/linux/sched_getscheduler.cpp index 545cda8e7484b..d8e02967a633d 100644 --- a/libc/src/sched/linux/sched_getscheduler.cpp +++ b/libc/src/sched/linux/sched_getscheduler.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_rr_get_interval.cpp b/libc/src/sched/linux/sched_rr_get_interval.cpp index 1f0ef69dfc893..5668d596bce1f 100644 --- a/libc/src/sched/linux/sched_rr_get_interval.cpp +++ b/libc/src/sched/linux/sched_rr_get_interval.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
diff --git a/libc/src/sched/linux/sched_setaffinity.cpp b/libc/src/sched/linux/sched_setaffinity.cpp index cad48c26bf938..93e930dcf2e3e 100644 --- a/libc/src/sched/linux/sched_setaffinity.cpp +++ b/libc/src/sched/linux/sched_setaffinity.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_setparam.cpp b/libc/src/sched/linux/sched_setparam.cpp index e78e78a707e05..7875d9e2f19bc 100644 --- a/libc/src/sched/linux/sched_setparam.cpp +++ b/libc/src/sched/linux/sched_setparam.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_setscheduler.cpp b/libc/src/sched/linux/sched_setscheduler.cpp index b6b6f667b3f9e..232e5a59b1858 100644 --- a/libc/src/sched/linux/sched_setscheduler.cpp +++ b/libc/src/sched/linux/sched_setscheduler.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sched/linux/sched_yield.cpp b/libc/src/sched/linux/sched_yield.cpp index 3de9d0ba35717..c1e9168f34d0e 100644 --- a/libc/src/sched/linux/sched_yield.cpp +++ b/libc/src/sched/linux/sched_yield.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/search/hcreate.cpp b/libc/src/search/hcreate.cpp index ac816a902e221..68bdb29e51dfb 100644 --- a/libc/src/search/hcreate.cpp +++ b/libc/src/search/hcreate.cpp @@ -9,8 +9,8 @@ #include "src/search/hcreate.h" #include "src/__support/HashTable/randomness.h" #include "src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/search/hsearch/global.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/search/hcreate_r.cpp b/libc/src/search/hcreate_r.cpp index 17acd808c19a6..c89be803b4e16 100644 --- a/libc/src/search/hcreate_r.cpp +++ b/libc/src/search/hcreate_r.cpp @@ -9,8 +9,8 @@ #include "src/search/hcreate_r.h" #include "src/__support/HashTable/randomness.h" #include "src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, hcreate_r, diff --git a/libc/src/search/hdestroy_r.cpp b/libc/src/search/hdestroy_r.cpp index 7eff5bb6fff9d..ba5476098be29 100644 --- a/libc/src/search/hdestroy_r.cpp +++ b/libc/src/search/hdestroy_r.cpp @@ -8,8 +8,8 @@ #include "src/search/hdestroy_r.h" #include "src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(void, hdestroy_r, (struct hsearch_data * htab)) { diff --git a/libc/src/search/hsearch.cpp b/libc/src/search/hsearch.cpp index c18b5d3d7f547..034333d170579 100644 --- a/libc/src/search/hsearch.cpp +++ b/libc/src/search/hsearch.cpp @@ -9,8 +9,8 @@ #include "src/search/hsearch.h" #include "src/__support/HashTable/randomness.h" #include 
"src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/search/hsearch/global.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/search/hsearch_r.cpp b/libc/src/search/hsearch_r.cpp index f93e608a190b1..323001e1b103d 100644 --- a/libc/src/search/hsearch_r.cpp +++ b/libc/src/search/hsearch_r.cpp @@ -8,8 +8,8 @@ #include "src/search/hsearch_r.h" #include "src/__support/HashTable/table.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(int, hsearch_r, diff --git a/libc/src/signal/linux/kill.cpp b/libc/src/signal/linux/kill.cpp index ed117858f51ef..0f5e88757acb8 100644 --- a/libc/src/signal/linux/kill.cpp +++ b/libc/src/signal/linux/kill.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include diff --git a/libc/src/signal/linux/sigaction.cpp b/libc/src/signal/linux/sigaction.cpp index 65ec36741683c..43a3e195474e5 100644 --- a/libc/src/signal/linux/sigaction.cpp +++ b/libc/src/signal/linux/sigaction.cpp @@ -10,8 +10,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/signal/linux/sigaddset.cpp b/libc/src/signal/linux/sigaddset.cpp index 628883e13b887..2091e8b51453f 100644 --- a/libc/src/signal/linux/sigaddset.cpp +++ b/libc/src/signal/linux/sigaddset.cpp @@ -10,8 +10,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" 
#include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/signal/linux/sigaltstack.cpp b/libc/src/signal/linux/sigaltstack.cpp index c19394cd17912..990b841c6d904 100644 --- a/libc/src/signal/linux/sigaltstack.cpp +++ b/libc/src/signal/linux/sigaltstack.cpp @@ -8,8 +8,8 @@ #include "src/signal/sigaltstack.h" #include "hdr/types/stack_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include "src/__support/common.h" diff --git a/libc/src/signal/linux/sigdelset.cpp b/libc/src/signal/linux/sigdelset.cpp index 2e964051ebde7..6fce0d7a6e147 100644 --- a/libc/src/signal/linux/sigdelset.cpp +++ b/libc/src/signal/linux/sigdelset.cpp @@ -10,8 +10,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/signal/linux/sigemptyset.cpp b/libc/src/signal/linux/sigemptyset.cpp index d347477695e6c..034a9e2cbe15e 100644 --- a/libc/src/signal/linux/sigemptyset.cpp +++ b/libc/src/signal/linux/sigemptyset.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/signal/sigemptyset.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include "src/__support/common.h" diff --git a/libc/src/signal/linux/sigfillset.cpp b/libc/src/signal/linux/sigfillset.cpp index 3e9897a03bb73..f0b499093b319 100644 --- a/libc/src/signal/linux/sigfillset.cpp +++ b/libc/src/signal/linux/sigfillset.cpp @@ -10,8 +10,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/common.h" +#include 
"src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/signal/linux/sigprocmask.cpp b/libc/src/signal/linux/sigprocmask.cpp index 8838379ae5d30..af3c424c5f34e 100644 --- a/libc/src/signal/linux/sigprocmask.cpp +++ b/libc/src/signal/linux/sigprocmask.cpp @@ -11,8 +11,8 @@ #include "hdr/types/sigset_t.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include // For syscall numbers. diff --git a/libc/src/spawn/posix_spawn_file_actions_addclose.cpp b/libc/src/spawn/posix_spawn_file_actions_addclose.cpp index bb8504f655c4a..9a575bd591632 100644 --- a/libc/src/spawn/posix_spawn_file_actions_addclose.cpp +++ b/libc/src/spawn/posix_spawn_file_actions_addclose.cpp @@ -11,8 +11,8 @@ #include "file_actions.h" #include "src/__support/CPP/new.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp b/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp index 710063d52e74d..1ad45ed942bb9 100644 --- a/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp +++ b/libc/src/spawn/posix_spawn_file_actions_adddup2.cpp @@ -11,8 +11,8 @@ #include "file_actions.h" #include "src/__support/CPP/new.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/spawn/posix_spawn_file_actions_addopen.cpp b/libc/src/spawn/posix_spawn_file_actions_addopen.cpp index 028d6e895f3c4..9977fc2d0a218 100644 --- 
a/libc/src/spawn/posix_spawn_file_actions_addopen.cpp +++ b/libc/src/spawn/posix_spawn_file_actions_addopen.cpp @@ -11,8 +11,8 @@ #include "file_actions.h" #include "src/__support/CPP/new.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/spawn/posix_spawn_file_actions_destroy.cpp b/libc/src/spawn/posix_spawn_file_actions_destroy.cpp index 168118da249d1..affd338005cf4 100644 --- a/libc/src/spawn/posix_spawn_file_actions_destroy.cpp +++ b/libc/src/spawn/posix_spawn_file_actions_destroy.cpp @@ -12,8 +12,8 @@ #include "src/__support/CPP/new.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/stdio/fopencookie.cpp b/libc/src/stdio/fopencookie.cpp index 9f5694e8e0581..da8a132a4db6e 100644 --- a/libc/src/stdio/fopencookie.cpp +++ b/libc/src/stdio/fopencookie.cpp @@ -14,8 +14,8 @@ #include "src/__support/CPP/new.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fclose.cpp b/libc/src/stdio/generic/fclose.cpp index 388407a58d414..902b4cf972373 100644 --- a/libc/src/stdio/generic/fclose.cpp +++ b/libc/src/stdio/generic/fclose.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fflush.cpp b/libc/src/stdio/generic/fflush.cpp index 5bdf71ad35940..d0271d9154c87 100644 --- a/libc/src/stdio/generic/fflush.cpp +++ b/libc/src/stdio/generic/fflush.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include 
"src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fgetc.cpp b/libc/src/stdio/generic/fgetc.cpp index aa6660ca180cf..e65ce2fda49bd 100644 --- a/libc/src/stdio/generic/fgetc.cpp +++ b/libc/src/stdio/generic/fgetc.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fgetc_unlocked.cpp b/libc/src/stdio/generic/fgetc_unlocked.cpp index 34a27f1d1c420..5c07d4feb513e 100644 --- a/libc/src/stdio/generic/fgetc_unlocked.cpp +++ b/libc/src/stdio/generic/fgetc_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fgets.cpp b/libc/src/stdio/generic/fgets.cpp index de6474087a140..e0ad9b6e2f564 100644 --- a/libc/src/stdio/generic/fgets.cpp +++ b/libc/src/stdio/generic/fgets.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fopen.cpp b/libc/src/stdio/generic/fopen.cpp index d6e418bacf37e..57c85c2e54e16 100644 --- a/libc/src/stdio/generic/fopen.cpp +++ b/libc/src/stdio/generic/fopen.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fputc.cpp 
b/libc/src/stdio/generic/fputc.cpp index 54a38aeb2f1e2..6639f0687c87a 100644 --- a/libc/src/stdio/generic/fputc.cpp +++ b/libc/src/stdio/generic/fputc.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fputs.cpp b/libc/src/stdio/generic/fputs.cpp index 8aef7683b3ce3..621b40f63c912 100644 --- a/libc/src/stdio/generic/fputs.cpp +++ b/libc/src/stdio/generic/fputs.cpp @@ -11,8 +11,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fread.cpp b/libc/src/stdio/generic/fread.cpp index 3a04094ea8b4b..1b576ec34688f 100644 --- a/libc/src/stdio/generic/fread.cpp +++ b/libc/src/stdio/generic/fread.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fread_unlocked.cpp b/libc/src/stdio/generic/fread_unlocked.cpp index 151f43c6bbeba..257f1a212add4 100644 --- a/libc/src/stdio/generic/fread_unlocked.cpp +++ b/libc/src/stdio/generic/fread_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fseek.cpp b/libc/src/stdio/generic/fseek.cpp index 21820da18542a..99191e7c41949 100644 --- a/libc/src/stdio/generic/fseek.cpp +++ b/libc/src/stdio/generic/fseek.cpp @@ -9,8 +9,8 @@ #include "src/stdio/fseek.h" 
#include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fseeko.cpp b/libc/src/stdio/generic/fseeko.cpp index 7456b4a219079..afcfc71c7c09a 100644 --- a/libc/src/stdio/generic/fseeko.cpp +++ b/libc/src/stdio/generic/fseeko.cpp @@ -9,8 +9,8 @@ #include "src/stdio/fseeko.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/ftell.cpp b/libc/src/stdio/generic/ftell.cpp index ec15ca4e96caf..b55a806007aff 100644 --- a/libc/src/stdio/generic/ftell.cpp +++ b/libc/src/stdio/generic/ftell.cpp @@ -9,8 +9,8 @@ #include "src/stdio/ftell.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/ftello.cpp b/libc/src/stdio/generic/ftello.cpp index e3d0726ec4843..91031cb7fad70 100644 --- a/libc/src/stdio/generic/ftello.cpp +++ b/libc/src/stdio/generic/ftello.cpp @@ -9,8 +9,8 @@ #include "src/stdio/ftello.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fwrite.cpp b/libc/src/stdio/generic/fwrite.cpp index 66eb9a3c71855..b44ecb2838118 100644 --- a/libc/src/stdio/generic/fwrite.cpp +++ b/libc/src/stdio/generic/fwrite.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/fwrite_unlocked.cpp 
b/libc/src/stdio/generic/fwrite_unlocked.cpp index a0d9014cd68de..2f9ec26f2f80c 100644 --- a/libc/src/stdio/generic/fwrite_unlocked.cpp +++ b/libc/src/stdio/generic/fwrite_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/getc.cpp b/libc/src/stdio/generic/getc.cpp index e988468898c53..0ac010ebc5994 100644 --- a/libc/src/stdio/generic/getc.cpp +++ b/libc/src/stdio/generic/getc.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/getc_unlocked.cpp b/libc/src/stdio/generic/getc_unlocked.cpp index 92d5092623ac5..eee23a18d05df 100644 --- a/libc/src/stdio/generic/getc_unlocked.cpp +++ b/libc/src/stdio/generic/getc_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/getchar.cpp b/libc/src/stdio/generic/getchar.cpp index 371fc70eb214f..87d24a2b1f09e 100644 --- a/libc/src/stdio/generic/getchar.cpp +++ b/libc/src/stdio/generic/getchar.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/getchar_unlocked.cpp b/libc/src/stdio/generic/getchar_unlocked.cpp index b898f5cb25963..f321969483e35 100644 --- a/libc/src/stdio/generic/getchar_unlocked.cpp +++ 
b/libc/src/stdio/generic/getchar_unlocked.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/putc.cpp b/libc/src/stdio/generic/putc.cpp index b5f008fdce44a..83bc3d4131e76 100644 --- a/libc/src/stdio/generic/putc.cpp +++ b/libc/src/stdio/generic/putc.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/putchar.cpp b/libc/src/stdio/generic/putchar.cpp index e86df23d6716b..2b3509e5e414c 100644 --- a/libc/src/stdio/generic/putchar.cpp +++ b/libc/src/stdio/generic/putchar.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/generic/puts.cpp b/libc/src/stdio/generic/puts.cpp index 7dbe2c79f920d..4267dd546c4dc 100644 --- a/libc/src/stdio/generic/puts.cpp +++ b/libc/src/stdio/generic/puts.cpp @@ -11,8 +11,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/gpu/fprintf.cpp b/libc/src/stdio/gpu/fprintf.cpp index 5b8f01d7d5346..9877817d92099 100644 --- a/libc/src/stdio/gpu/fprintf.cpp +++ b/libc/src/stdio/gpu/fprintf.cpp @@ -12,7 +12,7 @@ #include "src/__support/CPP/string_view.h" #include "src/__support/arg_list.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" 
#include "src/stdio/gpu/vfprintf_utils.h" #include diff --git a/libc/src/stdio/gpu/printf.cpp b/libc/src/stdio/gpu/printf.cpp index 53fe69d5e2ebe..8a9174d7397ae 100644 --- a/libc/src/stdio/gpu/printf.cpp +++ b/libc/src/stdio/gpu/printf.cpp @@ -11,7 +11,7 @@ #include "src/__support/CPP/string_view.h" #include "src/__support/arg_list.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/stdio/gpu/vfprintf_utils.h" #include diff --git a/libc/src/stdio/linux/fdopen.cpp b/libc/src/stdio/linux/fdopen.cpp index 7d72fdc88e9fb..5623f06b7cff0 100644 --- a/libc/src/stdio/linux/fdopen.cpp +++ b/libc/src/stdio/linux/fdopen.cpp @@ -9,8 +9,8 @@ #include "src/stdio/fdopen.h" #include "src/__support/File/linux/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/linux/remove.cpp b/libc/src/stdio/linux/remove.cpp index dbb4491d0e6cc..ac755db0bc781 100644 --- a/libc/src/stdio/linux/remove.cpp +++ b/libc/src/stdio/linux/remove.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" // For AT_* macros. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/linux/rename.cpp b/libc/src/stdio/linux/rename.cpp index fbcb29be48f4e..426c8698e557d 100644 --- a/libc/src/stdio/linux/rename.cpp +++ b/libc/src/stdio/linux/rename.cpp @@ -10,8 +10,8 @@ #include "hdr/fcntl_macros.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/printf_core/parser.h b/libc/src/stdio/printf_core/parser.h index 89556f1a9e5f2..cef9b1ae58fa0 100644 --- a/libc/src/stdio/printf_core/parser.h +++ b/libc/src/stdio/printf_core/parser.h @@ -25,7 +25,7 @@ #include "src/__support/fixed_point/fx_rep.h" #endif // LIBC_INTERNAL_PRINTF_HAS_FIXED_POINT #ifndef LIBC_COPT_PRINTF_DISABLE_STRERROR -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #endif // LIBC_COPT_PRINTF_DISABLE_STRERROR namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/setbuf.cpp b/libc/src/stdio/setbuf.cpp index f3db97de58371..fcc6df12ddb08 100644 --- a/libc/src/stdio/setbuf.cpp +++ b/libc/src/stdio/setbuf.cpp @@ -9,8 +9,8 @@ #include "src/stdio/setbuf.h" #include "hdr/stdio_macros.h" #include "src/__support/File/file.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdio/setvbuf.cpp b/libc/src/stdio/setvbuf.cpp index 0a6b8cacb59c8..9fc6cb040233b 100644 --- a/libc/src/stdio/setvbuf.cpp +++ b/libc/src/stdio/setvbuf.cpp @@ -10,8 +10,8 @@ #include "src/__support/File/file.h" #include "hdr/types/FILE.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/atof.cpp b/libc/src/stdlib/atof.cpp index 18a65c67705d3..d0d8d211dea8c 100644 --- a/libc/src/stdlib/atof.cpp +++ b/libc/src/stdlib/atof.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/atof.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/atoi.cpp b/libc/src/stdlib/atoi.cpp index 9e46b53b1aa0b..420bbc8143d55 100644 --- a/libc/src/stdlib/atoi.cpp +++ 
b/libc/src/stdlib/atoi.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/atoi.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/atol.cpp b/libc/src/stdlib/atol.cpp index 7f3414a4afdd2..e1110ffa449b0 100644 --- a/libc/src/stdlib/atol.cpp +++ b/libc/src/stdlib/atol.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/atol.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/atoll.cpp b/libc/src/stdlib/atoll.cpp index 4f1a02ad8315b..063e817f9b790 100644 --- a/libc/src/stdlib/atoll.cpp +++ b/libc/src/stdlib/atoll.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/atoll.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtod.cpp b/libc/src/stdlib/strtod.cpp index 2c6819163aa46..deb2390c7fcde 100644 --- a/libc/src/stdlib/strtod.cpp +++ b/libc/src/stdlib/strtod.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtod.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtod_l.cpp b/libc/src/stdlib/strtod_l.cpp index 247314398315b..ad333b32d2406 100644 --- a/libc/src/stdlib/strtod_l.cpp +++ b/libc/src/stdlib/strtod_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtod_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include 
"src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtof.cpp b/libc/src/stdlib/strtof.cpp index 351bf64ad4f70..fc52dc85ffc50 100644 --- a/libc/src/stdlib/strtof.cpp +++ b/libc/src/stdlib/strtof.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtof.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtof_l.cpp b/libc/src/stdlib/strtof_l.cpp index d54efa66e0846..c6e03ff51fa2f 100644 --- a/libc/src/stdlib/strtof_l.cpp +++ b/libc/src/stdlib/strtof_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtof_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtol.cpp b/libc/src/stdlib/strtol.cpp index 77f8712d7c136..42db36b2052b4 100644 --- a/libc/src/stdlib/strtol.cpp +++ b/libc/src/stdlib/strtol.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtol.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtol_l.cpp b/libc/src/stdlib/strtol_l.cpp index f94aff1a0d7b2..497a4403eff4b 100644 --- a/libc/src/stdlib/strtol_l.cpp +++ b/libc/src/stdlib/strtol_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtol_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtold.cpp b/libc/src/stdlib/strtold.cpp index 
88d29c9f36278..44046c2c6f613 100644 --- a/libc/src/stdlib/strtold.cpp +++ b/libc/src/stdlib/strtold.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtold.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtold_l.cpp b/libc/src/stdlib/strtold_l.cpp index d0c57f50246b5..c3af30a1b9ecc 100644 --- a/libc/src/stdlib/strtold_l.cpp +++ b/libc/src/stdlib/strtold_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtold_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoll.cpp b/libc/src/stdlib/strtoll.cpp index 8d1b3efdcf87d..c1dca13112e0f 100644 --- a/libc/src/stdlib/strtoll.cpp +++ b/libc/src/stdlib/strtoll.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoll.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoll_l.cpp b/libc/src/stdlib/strtoll_l.cpp index e82971d59c48d..6f30d7794c5ca 100644 --- a/libc/src/stdlib/strtoll_l.cpp +++ b/libc/src/stdlib/strtoll_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoll_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoul.cpp b/libc/src/stdlib/strtoul.cpp index 1d832318c4489..d26ca5e5a10a1 100644 --- a/libc/src/stdlib/strtoul.cpp +++ b/libc/src/stdlib/strtoul.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoul.h" #include 
"src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoul_l.cpp b/libc/src/stdlib/strtoul_l.cpp index 74fce00a0ac3c..9a875ddee9029 100644 --- a/libc/src/stdlib/strtoul_l.cpp +++ b/libc/src/stdlib/strtoul_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoul_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoull.cpp b/libc/src/stdlib/strtoull.cpp index dba22611cfb09..8f929f577311e 100644 --- a/libc/src/stdlib/strtoull.cpp +++ b/libc/src/stdlib/strtoull.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoull.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/stdlib/strtoull_l.cpp b/libc/src/stdlib/strtoull_l.cpp index 2ea8a43a40ef2..9eb056b0e59b4 100644 --- a/libc/src/stdlib/strtoull_l.cpp +++ b/libc/src/stdlib/strtoull_l.cpp @@ -8,9 +8,9 @@ #include "src/stdlib/strtoull_l.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/string/strdup.cpp b/libc/src/string/strdup.cpp index 4cf4173a27bf3..dab0ab4288c9e 100644 --- a/libc/src/string/strdup.cpp +++ b/libc/src/string/strdup.cpp @@ -8,8 +8,8 @@ #include "src/string/strdup.h" #include "hdr/stdlib_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include 
"src/string/allocating_string_utils.h" #include "src/string/memory_utils/inline_memcpy.h" diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp index 236fd25698f65..f3ae7c5c4e07a 100644 --- a/libc/src/sys/auxv/linux/getauxval.cpp +++ b/libc/src/sys/auxv/linux/getauxval.cpp @@ -9,8 +9,8 @@ #include "src/sys/auxv/getauxval.h" #include "config/app.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // for guarded initialization diff --git a/libc/src/sys/epoll/linux/epoll_create.cpp b/libc/src/sys/epoll/linux/epoll_create.cpp index 7196ac7410c30..2e44e883ddf0a 100644 --- a/libc/src/sys/epoll/linux/epoll_create.cpp +++ b/libc/src/sys/epoll/linux/epoll_create.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/epoll/linux/epoll_create1.cpp b/libc/src/sys/epoll/linux/epoll_create1.cpp index efff282e2714d..3c60090fb7b41 100644 --- a/libc/src/sys/epoll/linux/epoll_create1.cpp +++ b/libc/src/sys/epoll/linux/epoll_create1.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/epoll/linux/epoll_ctl.cpp b/libc/src/sys/epoll/linux/epoll_ctl.cpp index 5f7dbb77b1e5b..079bd60403b09 100644 --- a/libc/src/sys/epoll/linux/epoll_ctl.cpp +++ b/libc/src/sys/epoll/linux/epoll_ctl.cpp @@ -11,8 +11,8 @@ #include "hdr/types/struct_epoll_event.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/epoll/linux/epoll_pwait.cpp b/libc/src/sys/epoll/linux/epoll_pwait.cpp index d7836549928c4..24fd1dbdc467d 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait.cpp @@ -13,9 +13,9 @@ #include "hdr/types/struct_epoll_event.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sys/epoll/linux/epoll_pwait2.cpp b/libc/src/sys/epoll/linux/epoll_pwait2.cpp index 14b419399fe9b..219984528efdd 100644 --- a/libc/src/sys/epoll/linux/epoll_pwait2.cpp +++ b/libc/src/sys/epoll/linux/epoll_pwait2.cpp @@ -14,9 +14,9 @@ #include "hdr/types/struct_timespec.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
diff --git a/libc/src/sys/epoll/linux/epoll_wait.cpp b/libc/src/sys/epoll/linux/epoll_wait.cpp index 1a63be5e260fb..7fae7b55992fa 100644 --- a/libc/src/sys/epoll/linux/epoll_wait.cpp +++ b/libc/src/sys/epoll/linux/epoll_wait.cpp @@ -13,9 +13,9 @@ #include "hdr/types/struct_epoll_event.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/sys/mman/linux/madvise.cpp b/libc/src/sys/mman/linux/madvise.cpp index 332d6c2db4acb..1bb284f62b892 100644 --- a/libc/src/sys/mman/linux/madvise.cpp +++ b/libc/src/sys/mman/linux/madvise.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mincore.cpp b/libc/src/sys/mman/linux/mincore.cpp index b5436fda3853a..d583f1ef85f3d 100644 --- a/libc/src/sys/mman/linux/mincore.cpp +++ b/libc/src/sys/mman/linux/mincore.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mlock.cpp b/libc/src/sys/mman/linux/mlock.cpp index be7eb28e29c4f..8582eb7c00632 100644 --- a/libc/src/sys/mman/linux/mlock.cpp +++ b/libc/src/sys/mman/linux/mlock.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
+#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mlock2.cpp b/libc/src/sys/mman/linux/mlock2.cpp index 7bc557f9bf58f..955cfe128de74 100644 --- a/libc/src/sys/mman/linux/mlock2.cpp +++ b/libc/src/sys/mman/linux/mlock2.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mlockall.cpp b/libc/src/sys/mman/linux/mlockall.cpp index eae3a9ea0a183..c3502fbb3af39 100644 --- a/libc/src/sys/mman/linux/mlockall.cpp +++ b/libc/src/sys/mman/linux/mlockall.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mmap.cpp b/libc/src/sys/mman/linux/mmap.cpp index ee9a0a32e8f55..33f9fe8ff3709 100644 --- a/libc/src/sys/mman/linux/mmap.cpp +++ b/libc/src/sys/mman/linux/mmap.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For EXEC_PAGESIZE. #include // For syscall numbers. diff --git a/libc/src/sys/mman/linux/mprotect.cpp b/libc/src/sys/mman/linux/mprotect.cpp index e2351028e2c7f..6b14915b60c94 100644 --- a/libc/src/sys/mman/linux/mprotect.cpp +++ b/libc/src/sys/mman/linux/mprotect.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/mremap.cpp b/libc/src/sys/mman/linux/mremap.cpp index 38bcfce833d3d..6cdda9435bb69 100644 --- a/libc/src/sys/mman/linux/mremap.cpp +++ b/libc/src/sys/mman/linux/mremap.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For EXEC_PAGESIZE. #include #include // For syscall numbers. diff --git a/libc/src/sys/mman/linux/msync.cpp b/libc/src/sys/mman/linux/msync.cpp index e2b4f81d616ad..650678bcb36e0 100644 --- a/libc/src/sys/mman/linux/msync.cpp +++ b/libc/src/sys/mman/linux/msync.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/munlock.cpp b/libc/src/sys/mman/linux/munlock.cpp index 93c25f844c6e8..9638949f5fcb3 100644 --- a/libc/src/sys/mman/linux/munlock.cpp +++ b/libc/src/sys/mman/linux/munlock.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/munlockall.cpp b/libc/src/sys/mman/linux/munlockall.cpp index f5911cb01bc28..f47eaece178e3 100644 --- a/libc/src/sys/mman/linux/munlockall.cpp +++ b/libc/src/sys/mman/linux/munlockall.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/munmap.cpp b/libc/src/sys/mman/linux/munmap.cpp index 9c01b15ac8dc2..61b1f1549dd18 100644 --- a/libc/src/sys/mman/linux/munmap.cpp +++ b/libc/src/sys/mman/linux/munmap.cpp @@ -11,9 +11,9 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" -#include // For syscall numbers. +#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/remap_file_pages.cpp b/libc/src/sys/mman/linux/remap_file_pages.cpp index f616e1915ecc5..58ae4017f6285 100644 --- a/libc/src/sys/mman/linux/remap_file_pages.cpp +++ b/libc/src/sys/mman/linux/remap_file_pages.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/mman/linux/shm_common.h b/libc/src/sys/mman/linux/shm_common.h index ce75c2b5b6991..69911012ff7e9 100644 --- a/libc/src/sys/mman/linux/shm_common.h +++ b/libc/src/sys/mman/linux/shm_common.h @@ -9,8 +9,8 @@ #include "src/__support/CPP/array.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/string_view.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/string/memory_utils/inline_memcpy.h" // TODO: Get PATH_MAX via https://github.com/llvm/llvm-project/issues/85121 diff --git a/libc/src/sys/prctl/linux/prctl.cpp b/libc/src/sys/prctl/linux/prctl.cpp index 5d4e9046b8777..c726b0a539591 100644 --- a/libc/src/sys/prctl/linux/prctl.cpp +++ b/libc/src/sys/prctl/linux/prctl.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/random/linux/getrandom.cpp b/libc/src/sys/random/linux/getrandom.cpp index 9a8869a2d6d38..0b8471ed8b374 100644 --- a/libc/src/sys/random/linux/getrandom.cpp +++ b/libc/src/sys/random/linux/getrandom.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/resource/linux/getrlimit.cpp b/libc/src/sys/resource/linux/getrlimit.cpp index 30c2e91b036d1..d272134194949 100644 --- a/libc/src/sys/resource/linux/getrlimit.cpp +++ b/libc/src/sys/resource/linux/getrlimit.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For struct rlimit #include // For syscall numbers. diff --git a/libc/src/sys/resource/linux/setrlimit.cpp b/libc/src/sys/resource/linux/setrlimit.cpp index 85f07900aaef4..300bad75baa63 100644 --- a/libc/src/sys/resource/linux/setrlimit.cpp +++ b/libc/src/sys/resource/linux/setrlimit.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For struct rlimit #include // For syscall numbers. diff --git a/libc/src/sys/select/linux/select.cpp b/libc/src/sys/select/linux/select.cpp index 9ccb1e95f275c..6c434eb584596 100644 --- a/libc/src/sys/select/linux/select.cpp +++ b/libc/src/sys/select/linux/select.cpp @@ -13,8 +13,8 @@ #include "src/__support/CPP/limits.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For size_t #include // For syscall numbers. diff --git a/libc/src/sys/sendfile/linux/sendfile.cpp b/libc/src/sys/sendfile/linux/sendfile.cpp index 9d4174cb8c916..ec892323def50 100644 --- a/libc/src/sys/sendfile/linux/sendfile.cpp +++ b/libc/src/sys/sendfile/linux/sendfile.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. 
diff --git a/libc/src/sys/socket/linux/bind.cpp b/libc/src/sys/socket/linux/bind.cpp index 72a3307a91ddd..83a3d06f5380b 100644 --- a/libc/src/sys/socket/linux/bind.cpp +++ b/libc/src/sys/socket/linux/bind.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For SYS_SOCKET socketcall number. #include // For syscall numbers. diff --git a/libc/src/sys/socket/linux/recv.cpp b/libc/src/sys/socket/linux/recv.cpp index 5e9f2d3233fcf..baf4de1b5eb54 100644 --- a/libc/src/sys/socket/linux/recv.cpp +++ b/libc/src/sys/socket/linux/recv.cpp @@ -16,8 +16,8 @@ #include "hdr/types/struct_sockaddr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/recvfrom.cpp b/libc/src/sys/socket/linux/recvfrom.cpp index 574e65f64a54b..3d8397b478cc4 100644 --- a/libc/src/sys/socket/linux/recvfrom.cpp +++ b/libc/src/sys/socket/linux/recvfrom.cpp @@ -16,8 +16,8 @@ #include "hdr/types/struct_sockaddr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/recvmsg.cpp b/libc/src/sys/socket/linux/recvmsg.cpp index e42b6346f330a..bc6d072dbf9a1 100644 --- a/libc/src/sys/socket/linux/recvmsg.cpp +++ b/libc/src/sys/socket/linux/recvmsg.cpp @@ -15,8 +15,8 @@ #include "hdr/types/struct_msghdr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/send.cpp b/libc/src/sys/socket/linux/send.cpp index cb3b4d5a9ece7..43b01e7e6e0f6 100644 --- a/libc/src/sys/socket/linux/send.cpp +++ b/libc/src/sys/socket/linux/send.cpp @@ -16,7 +16,7 @@ #include "hdr/types/struct_sockaddr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/sendmsg.cpp b/libc/src/sys/socket/linux/sendmsg.cpp index b4d9c9deda028..b04783ebfe7e7 100644 --- a/libc/src/sys/socket/linux/sendmsg.cpp +++ b/libc/src/sys/socket/linux/sendmsg.cpp @@ -15,7 +15,7 @@ #include "hdr/types/struct_msghdr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/sendto.cpp b/libc/src/sys/socket/linux/sendto.cpp index 2fada192b0865..9dda127f872d5 100644 --- a/libc/src/sys/socket/linux/sendto.cpp +++ b/libc/src/sys/socket/linux/sendto.cpp @@ -16,7 +16,7 @@ #include "hdr/types/struct_sockaddr.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/socket/linux/socket.cpp b/libc/src/sys/socket/linux/socket.cpp index 3e6df4d487a53..69eb6cfa01ced 100644 --- a/libc/src/sys/socket/linux/socket.cpp +++ b/libc/src/sys/socket/linux/socket.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For SYS_SOCKET socketcall number. #include // For syscall numbers. diff --git a/libc/src/sys/socket/linux/socketpair.cpp b/libc/src/sys/socket/linux/socketpair.cpp index 60612ac04d613..7ea8ca46cee58 100644 --- a/libc/src/sys/socket/linux/socketpair.cpp +++ b/libc/src/sys/socket/linux/socketpair.cpp @@ -10,9 +10,9 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" -#include "src/errno/libc_errno.h" #include // For SYS_SOCKET socketcall number. #include // For syscall numbers. diff --git a/libc/src/sys/stat/linux/chmod.cpp b/libc/src/sys/stat/linux/chmod.cpp index 1b787e47e7c68..2bd0788ec1dfd 100644 --- a/libc/src/sys/stat/linux/chmod.cpp +++ b/libc/src/sys/stat/linux/chmod.cpp @@ -13,8 +13,8 @@ #include "hdr/fcntl_macros.h" #include "hdr/types/mode_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/sys/stat/linux/fchmod.cpp b/libc/src/sys/stat/linux/fchmod.cpp index 0d6fd359169aa..3dadfdd1d943c 100644 --- a/libc/src/sys/stat/linux/fchmod.cpp +++ b/libc/src/sys/stat/linux/fchmod.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/types/mode_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. 
diff --git a/libc/src/sys/stat/linux/fchmodat.cpp b/libc/src/sys/stat/linux/fchmodat.cpp index e76db4d160fb8..add2192a558a4 100644 --- a/libc/src/sys/stat/linux/fchmodat.cpp +++ b/libc/src/sys/stat/linux/fchmodat.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/sys/stat/linux/fstat.cpp b/libc/src/sys/stat/linux/fstat.cpp index 35cf8f08f782d..dea002c5e12a5 100644 --- a/libc/src/sys/stat/linux/fstat.cpp +++ b/libc/src/sys/stat/linux/fstat.cpp @@ -8,8 +8,8 @@ #include "src/sys/stat/fstat.h" #include "kernel_statx.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/common.h" diff --git a/libc/src/sys/stat/linux/lstat.cpp b/libc/src/sys/stat/linux/lstat.cpp index 354c5b6e029a4..5601dd5d78a98 100644 --- a/libc/src/sys/stat/linux/lstat.cpp +++ b/libc/src/sys/stat/linux/lstat.cpp @@ -8,8 +8,8 @@ #include "src/sys/stat/lstat.h" #include "kernel_statx.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" diff --git a/libc/src/sys/stat/linux/mkdir.cpp b/libc/src/sys/stat/linux/mkdir.cpp index b319b5c8393de..0829ff4f94322 100644 --- a/libc/src/sys/stat/linux/mkdir.cpp +++ b/libc/src/sys/stat/linux/mkdir.cpp @@ -13,8 +13,8 @@ #include "hdr/fcntl_macros.h" #include "hdr/types/mode_t.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. 
diff --git a/libc/src/sys/stat/linux/mkdirat.cpp b/libc/src/sys/stat/linux/mkdirat.cpp index 097fc158010d1..8f4194dc32752 100644 --- a/libc/src/sys/stat/linux/mkdirat.cpp +++ b/libc/src/sys/stat/linux/mkdirat.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/sys/stat/linux/stat.cpp b/libc/src/sys/stat/linux/stat.cpp index de9cdb197d687..5553eaf00be2a 100644 --- a/libc/src/sys/stat/linux/stat.cpp +++ b/libc/src/sys/stat/linux/stat.cpp @@ -8,8 +8,8 @@ #include "src/sys/stat/stat.h" #include "kernel_statx.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/common.h" diff --git a/libc/src/sys/statvfs/linux/statfs_utils.h b/libc/src/sys/statvfs/linux/statfs_utils.h index 1e5be51531012..8ee4de288ef61 100644 --- a/libc/src/sys/statvfs/linux/statfs_utils.h +++ b/libc/src/sys/statvfs/linux/statfs_utils.h @@ -12,9 +12,9 @@ #include "include/llvm-libc-types/struct_statvfs.h" #include "src/__support/CPP/optional.h" #include "src/__support/OSUtil/syscall.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/attributes.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/time/linux/getitimer.cpp b/libc/src/sys/time/linux/getitimer.cpp index fec06aa4086e9..b874066796940 100644 --- a/libc/src/sys/time/linux/getitimer.cpp +++ b/libc/src/sys/time/linux/getitimer.cpp @@ -10,7 +10,7 @@ #include "hdr/types/struct_itimerval.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff 
--git a/libc/src/sys/time/linux/setitimer.cpp b/libc/src/sys/time/linux/setitimer.cpp index def04a4740118..1de0d43297760 100644 --- a/libc/src/sys/time/linux/setitimer.cpp +++ b/libc/src/sys/time/linux/setitimer.cpp @@ -9,7 +9,7 @@ #include "hdr/types/struct_itimerval.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/time/linux/utimes.cpp b/libc/src/sys/time/linux/utimes.cpp index 76b69937a5f48..ed37b42aedf6c 100644 --- a/libc/src/sys/time/linux/utimes.cpp +++ b/libc/src/sys/time/linux/utimes.cpp @@ -15,7 +15,7 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include diff --git a/libc/src/sys/uio/linux/readv.cpp b/libc/src/sys/uio/linux/readv.cpp index f1393a9749be9..c9d8d87ddc72b 100644 --- a/libc/src/sys/uio/linux/readv.cpp +++ b/libc/src/sys/uio/linux/readv.cpp @@ -10,7 +10,7 @@ #include "hdr/types/struct_iovec.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/uio/linux/writev.cpp b/libc/src/sys/uio/linux/writev.cpp index 8992bed95c982..b0b9e15207922 100644 --- a/libc/src/sys/uio/linux/writev.cpp +++ b/libc/src/sys/uio/linux/writev.cpp @@ -10,7 +10,7 @@ #include "hdr/types/struct_iovec.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/sys/utsname/linux/uname.cpp b/libc/src/sys/utsname/linux/uname.cpp index 7bb227e801e3a..b47ba964faf0b 100644 --- a/libc/src/sys/utsname/linux/uname.cpp +++ b/libc/src/sys/utsname/linux/uname.cpp @@ -11,8 +11,8 @@ #include 
"src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. #include diff --git a/libc/src/sys/wait/wait4Impl.h b/libc/src/sys/wait/wait4Impl.h index f2bdeb02f8668..77ed3ad22f148 100644 --- a/libc/src/sys/wait/wait4Impl.h +++ b/libc/src/sys/wait/wait4Impl.h @@ -12,8 +12,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" #include "src/__support/error_or.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include #include // For syscall numbers. diff --git a/libc/src/termios/linux/cfsetispeed.cpp b/libc/src/termios/linux/cfsetispeed.cpp index 9656b714a8ed2..47b19974d21be 100644 --- a/libc/src/termios/linux/cfsetispeed.cpp +++ b/libc/src/termios/linux/cfsetispeed.cpp @@ -9,8 +9,8 @@ #include "src/termios/cfsetispeed.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include diff --git a/libc/src/termios/linux/cfsetospeed.cpp b/libc/src/termios/linux/cfsetospeed.cpp index 6130d266dbff0..d2f138257a47a 100644 --- a/libc/src/termios/linux/cfsetospeed.cpp +++ b/libc/src/termios/linux/cfsetospeed.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/termios/cfsetospeed.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/common.h" diff --git a/libc/src/termios/linux/tcdrain.cpp b/libc/src/termios/linux/tcdrain.cpp index 116e3f0e0cbc5..570b15c24fe7f 100644 --- a/libc/src/termios/linux/tcdrain.cpp +++ b/libc/src/termios/linux/tcdrain.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include 
"src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. #include // For syscall numbers diff --git a/libc/src/termios/linux/tcflow.cpp b/libc/src/termios/linux/tcflow.cpp index d229230b5d138..714ef6aa71298 100644 --- a/libc/src/termios/linux/tcflow.cpp +++ b/libc/src/termios/linux/tcflow.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. #include // For syscall numbers diff --git a/libc/src/termios/linux/tcflush.cpp b/libc/src/termios/linux/tcflush.cpp index 028a5414b1960..4c7b9fadc446d 100644 --- a/libc/src/termios/linux/tcflush.cpp +++ b/libc/src/termios/linux/tcflush.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. #include // For syscall numbers diff --git a/libc/src/termios/linux/tcgetattr.cpp b/libc/src/termios/linux/tcgetattr.cpp index 63c096ff88eba..2e768269c874d 100644 --- a/libc/src/termios/linux/tcgetattr.cpp +++ b/libc/src/termios/linux/tcgetattr.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. 
#include // For syscall numbers diff --git a/libc/src/termios/linux/tcgetsid.cpp b/libc/src/termios/linux/tcgetsid.cpp index c283d0e4fda9a..7487816cf2741 100644 --- a/libc/src/termios/linux/tcgetsid.cpp +++ b/libc/src/termios/linux/tcgetsid.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. #include // For syscall numbers diff --git a/libc/src/termios/linux/tcsendbreak.cpp b/libc/src/termios/linux/tcsendbreak.cpp index 30bc91cf3de0a..1d546c1d5953e 100644 --- a/libc/src/termios/linux/tcsendbreak.cpp +++ b/libc/src/termios/linux/tcsendbreak.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. #include // For syscall numbers diff --git a/libc/src/termios/linux/tcsetattr.cpp b/libc/src/termios/linux/tcsetattr.cpp index 8aa1e5c57b34e..8a2c7290217ba 100644 --- a/libc/src/termios/linux/tcsetattr.cpp +++ b/libc/src/termios/linux/tcsetattr.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // Safe to include without the risk of name pollution. 
#include // For syscall numbers diff --git a/libc/src/threads/thrd_create.cpp b/libc/src/threads/thrd_create.cpp index 4680944c2eee0..67e22e72fd0e4 100644 --- a/libc/src/threads/thrd_create.cpp +++ b/libc/src/threads/thrd_create.cpp @@ -8,9 +8,9 @@ #include "src/threads/thrd_create.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" #include // For thrd_* type definitions. diff --git a/libc/src/time/linux/clock.cpp b/libc/src/time/linux/clock.cpp index ee4fa82b4f894..c38697cd0668e 100644 --- a/libc/src/time/linux/clock.cpp +++ b/libc/src/time/linux/clock.cpp @@ -10,10 +10,10 @@ #include "hdr/time_macros.h" #include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/time/clock_gettime.h" #include "src/__support/time/units.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/time/linux/clock_gettime.cpp b/libc/src/time/linux/clock_gettime.cpp index 743c644d65d02..b3fcd2b22f9da 100644 --- a/libc/src/time/linux/clock_gettime.cpp +++ b/libc/src/time/linux/clock_gettime.cpp @@ -8,9 +8,9 @@ #include "src/time/clock_gettime.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/time/clock_gettime.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/time/linux/gettimeofday.cpp b/libc/src/time/linux/gettimeofday.cpp index e8ddf482fc984..237b05903c70f 100644 --- a/libc/src/time/linux/gettimeofday.cpp +++ b/libc/src/time/linux/gettimeofday.cpp @@ -10,10 +10,10 @@ #include "hdr/time_macros.h" #include "hdr/types/suseconds_t.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include 
"src/__support/time/clock_gettime.h" #include "src/__support/time/units.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/time/linux/nanosleep.cpp b/libc/src/time/linux/nanosleep.cpp index 7a856376ffb20..6b9704126a0a5 100644 --- a/libc/src/time/linux/nanosleep.cpp +++ b/libc/src/time/linux/nanosleep.cpp @@ -10,8 +10,8 @@ #include "hdr/time_macros.h" #include "src/__support/OSUtil/syscall.h" // For syscall functions. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For int64_t. #include // For syscall numbers. diff --git a/libc/src/time/linux/timespec_get.cpp b/libc/src/time/linux/timespec_get.cpp index cf5174523aa4f..a4d4372332732 100644 --- a/libc/src/time/linux/timespec_get.cpp +++ b/libc/src/time/linux/timespec_get.cpp @@ -9,9 +9,9 @@ #include "src/time/timespec_get.h" #include "hdr/time_macros.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/time/clock_gettime.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/time/time.cpp b/libc/src/time/time.cpp index 860909af7488c..2a81f0182c313 100644 --- a/libc/src/time/time.cpp +++ b/libc/src/time/time.cpp @@ -10,9 +10,9 @@ #include "hdr/time_macros.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/time/clock_gettime.h" -#include "src/errno/libc_errno.h" namespace LIBC_NAMESPACE_DECL { // avoid inconsitent clang-format behavior diff --git a/libc/src/time/time_utils.h b/libc/src/time/time_utils.h index bbbb1c08a4759..0541c24ece82b 100644 --- a/libc/src/time/time_utils.h +++ b/libc/src/time/time_utils.h @@ -15,8 +15,8 @@ #include "src/__support/CPP/optional.h" #include "src/__support/CPP/string_view.h" #include "src/__support/common.h" +#include 
"src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "time_constants.h" #include diff --git a/libc/src/time/windows/clock_getres.cpp b/libc/src/time/windows/clock_getres.cpp index b8c0c82aa6419..969bb66be2d25 100644 --- a/libc/src/time/windows/clock_getres.cpp +++ b/libc/src/time/windows/clock_getres.cpp @@ -13,10 +13,10 @@ #include "src/__support/CPP/limits.h" #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/optimization.h" #include "src/__support/time/units.h" #include "src/__support/time/windows/performance_counter.h" -#include "src/errno/libc_errno.h" #include "src/time/clock_getres.h" #define WIN32_LEAN_AND_MEAN diff --git a/libc/src/unistd/linux/access.cpp b/libc/src/unistd/linux/access.cpp index 2f7ebbcdf9e81..55cd6adca779d 100644 --- a/libc/src/unistd/linux/access.cpp +++ b/libc/src/unistd/linux/access.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/chdir.cpp b/libc/src/unistd/linux/chdir.cpp index a30d1dc883be8..04ba509b49a56 100644 --- a/libc/src/unistd/linux/chdir.cpp +++ b/libc/src/unistd/linux/chdir.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/close.cpp b/libc/src/unistd/linux/close.cpp index 58d42a9673fbe..b5842f2b64d20 100644 --- a/libc/src/unistd/linux/close.cpp +++ b/libc/src/unistd/linux/close.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/dup.cpp b/libc/src/unistd/linux/dup.cpp index c1710a37f6119..81d30c6cdbc4c 100644 --- a/libc/src/unistd/linux/dup.cpp +++ b/libc/src/unistd/linux/dup.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/dup2.cpp b/libc/src/unistd/linux/dup2.cpp index 7ffc151a053c9..0a0e86573b34e 100644 --- a/libc/src/unistd/linux/dup2.cpp +++ b/libc/src/unistd/linux/dup2.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/dup3.cpp b/libc/src/unistd/linux/dup3.cpp index c096ba73c96bd..770fb73515b21 100644 --- a/libc/src/unistd/linux/dup3.cpp +++ b/libc/src/unistd/linux/dup3.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/execv.cpp b/libc/src/unistd/linux/execv.cpp index a3f2525ed7ca1..d4f2bd9a51653 100644 --- a/libc/src/unistd/linux/execv.cpp +++ b/libc/src/unistd/linux/execv.cpp @@ -13,7 +13,7 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/execve.cpp b/libc/src/unistd/linux/execve.cpp index 37162c4121782..2214b6df493bd 100644 --- a/libc/src/unistd/linux/execve.cpp +++ b/libc/src/unistd/linux/execve.cpp @@ -13,7 +13,7 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/fchdir.cpp b/libc/src/unistd/linux/fchdir.cpp index 8196dc63ab1e1..f7a7422363e6e 100644 --- a/libc/src/unistd/linux/fchdir.cpp +++ b/libc/src/unistd/linux/fchdir.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/fork.cpp b/libc/src/unistd/linux/fork.cpp index 8aa0477a15d58..75a76fdea50b2 100644 --- a/libc/src/unistd/linux/fork.cpp +++ b/libc/src/unistd/linux/fork.cpp @@ -15,7 +15,7 @@ #include "src/__support/threads/identifier.h" #include "src/__support/threads/thread.h" // For thread self object -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // For SIGCHLD #include // For syscall numbers. diff --git a/libc/src/unistd/linux/fsync.cpp b/libc/src/unistd/linux/fsync.cpp index ae3895bab15f3..fe08aed61e250 100644 --- a/libc/src/unistd/linux/fsync.cpp +++ b/libc/src/unistd/linux/fsync.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/ftruncate.cpp b/libc/src/unistd/linux/ftruncate.cpp index ccbb0634664aa..f6aa6f8b48cc9 100644 --- a/libc/src/unistd/linux/ftruncate.cpp +++ b/libc/src/unistd/linux/ftruncate.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/unistd_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For uint64_t. #include // For syscall numbers. diff --git a/libc/src/unistd/linux/getcwd.cpp b/libc/src/unistd/linux/getcwd.cpp index 1bb11a7c8e7ba..c0e475dd3e8ff 100644 --- a/libc/src/unistd/linux/getcwd.cpp +++ b/libc/src/unistd/linux/getcwd.cpp @@ -13,7 +13,7 @@ #include "src/__support/macros/config.h" #include "src/string/allocating_string_utils.h" // For strdup. -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // This is safe to include without any name pollution. #include // For syscall numbers. diff --git a/libc/src/unistd/linux/getentropy.cpp b/libc/src/unistd/linux/getentropy.cpp index 168a1197734ed..65bcbf27601da 100644 --- a/libc/src/unistd/linux/getentropy.cpp +++ b/libc/src/unistd/linux/getentropy.cpp @@ -10,7 +10,7 @@ #include "hdr/errno_macros.h" #include "src/__support/OSUtil/syscall.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/getsid.cpp b/libc/src/unistd/linux/getsid.cpp index 5977c5bf10e94..025b8d1691ac3 100644 --- a/libc/src/unistd/linux/getsid.cpp +++ b/libc/src/unistd/linux/getsid.cpp @@ -11,8 +11,8 @@ #include "hdr/types/pid_t.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/isatty.cpp b/libc/src/unistd/linux/isatty.cpp index e6ea22a714c78..a4d17912b57b0 100644 --- a/libc/src/unistd/linux/isatty.cpp +++ b/libc/src/unistd/linux/isatty.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For ioctl numbers. #include // For syscall numbers. diff --git a/libc/src/unistd/linux/link.cpp b/libc/src/unistd/linux/link.cpp index 477806a70df74..205cf8a84a5cb 100644 --- a/libc/src/unistd/linux/link.cpp +++ b/libc/src/unistd/linux/link.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/linkat.cpp b/libc/src/unistd/linux/linkat.cpp index 40f68cc90c480..ea5bc48cbedc5 100644 --- a/libc/src/unistd/linux/linkat.cpp +++ b/libc/src/unistd/linux/linkat.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/lseek.cpp b/libc/src/unistd/linux/lseek.cpp index 0e957498da746..26a08269fd8de 100644 --- a/libc/src/unistd/linux/lseek.cpp +++ b/libc/src/unistd/linux/lseek.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/unistd/lseek.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/__support/File/linux/lseekImpl.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. diff --git a/libc/src/unistd/linux/pathconf.cpp b/libc/src/unistd/linux/pathconf.cpp index ca1c10bb9f7f6..7dde857c1cfd8 100644 --- a/libc/src/unistd/linux/pathconf.cpp +++ b/libc/src/unistd/linux/pathconf.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "src/unistd/pathconf.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/sys/statvfs/linux/statfs_utils.h" #include "src/unistd/linux/pathconf_utils.h" diff --git a/libc/src/unistd/linux/pathconf_utils.cpp b/libc/src/unistd/linux/pathconf_utils.cpp index 035e628dff253..9a62e31fd1880 100644 --- a/libc/src/unistd/linux/pathconf_utils.cpp +++ b/libc/src/unistd/linux/pathconf_utils.cpp @@ -14,8 +14,8 @@ #include "hdr/unistd_macros.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
#include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/sys/statvfs/linux/statfs_utils.h" // other linux specific includes diff --git a/libc/src/unistd/linux/pipe.cpp b/libc/src/unistd/linux/pipe.cpp index dfcd5bfdaf537..b9943c8338056 100644 --- a/libc/src/unistd/linux/pipe.cpp +++ b/libc/src/unistd/linux/pipe.cpp @@ -10,10 +10,10 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON -#include "src/errno/libc_errno.h" -#include // For syscall numbers. +#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/pipe2.cpp b/libc/src/unistd/linux/pipe2.cpp index ebe7e0114ae99..d30f3b37a1adc 100644 --- a/libc/src/unistd/linux/pipe2.cpp +++ b/libc/src/unistd/linux/pipe2.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/pread.cpp b/libc/src/unistd/linux/pread.cpp index 3e27857f9a2b4..2f86e397feeff 100644 --- a/libc/src/unistd/linux/pread.cpp +++ b/libc/src/unistd/linux/pread.cpp @@ -10,11 +10,11 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON -#include "src/errno/libc_errno.h" -#include // For uint64_t. -#include // For syscall numbers. +#include // For uint64_t. +#include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/pwrite.cpp b/libc/src/unistd/linux/pwrite.cpp index 1b81b2a059494..f4cf8e16d766f 100644 --- a/libc/src/unistd/linux/pwrite.cpp +++ b/libc/src/unistd/linux/pwrite.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For uint64_t. #include // For syscall numbers. diff --git a/libc/src/unistd/linux/read.cpp b/libc/src/unistd/linux/read.cpp index 4419900f2330e..55676f3f7010a 100644 --- a/libc/src/unistd/linux/read.cpp +++ b/libc/src/unistd/linux/read.cpp @@ -10,10 +10,10 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/sanitizer.h" // for MSAN_UNPOISON -#include "src/errno/libc_errno.h" -#include // For syscall numbers. +#include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/readlink.cpp b/libc/src/unistd/linux/readlink.cpp index 2055e6b3400f2..b297a41ca37bd 100644 --- a/libc/src/unistd/linux/readlink.cpp +++ b/libc/src/unistd/linux/readlink.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/readlinkat.cpp b/libc/src/unistd/linux/readlinkat.cpp index e5e4d0d39bc9c..cd0dcb8e0ff02 100644 --- a/libc/src/unistd/linux/readlinkat.cpp +++ b/libc/src/unistd/linux/readlinkat.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/rmdir.cpp b/libc/src/unistd/linux/rmdir.cpp index 075af12af64c5..eca6e954ef898 100644 --- a/libc/src/unistd/linux/rmdir.cpp +++ b/libc/src/unistd/linux/rmdir.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/symlink.cpp b/libc/src/unistd/linux/symlink.cpp index 9e1b2886ea0f5..3f43de19d2f46 100644 --- a/libc/src/unistd/linux/symlink.cpp +++ b/libc/src/unistd/linux/symlink.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/symlinkat.cpp b/libc/src/unistd/linux/symlinkat.cpp index bcf2d0f8cc055..8cee172f39dfa 100644 --- a/libc/src/unistd/linux/symlinkat.cpp +++ b/libc/src/unistd/linux/symlinkat.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/syscall.cpp b/libc/src/unistd/linux/syscall.cpp index 5394bff46adfa..0f7b3da88d627 100644 --- a/libc/src/unistd/linux/syscall.cpp +++ b/libc/src/unistd/linux/syscall.cpp @@ -11,8 +11,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/sysconf.cpp b/libc/src/unistd/linux/sysconf.cpp index f785ff321c7d7..03f224b150273 100644 --- a/libc/src/unistd/linux/sysconf.cpp +++ b/libc/src/unistd/linux/sysconf.cpp @@ -11,8 +11,8 @@ #include "src/__support/common.h" #include "hdr/unistd_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/sys/auxv/getauxval.h" #include diff --git a/libc/src/unistd/linux/truncate.cpp b/libc/src/unistd/linux/truncate.cpp index 8236edb480d10..6103d4b51350b 100644 --- a/libc/src/unistd/linux/truncate.cpp +++ b/libc/src/unistd/linux/truncate.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/unistd_macros.h" #include // For uint64_t. diff --git a/libc/src/unistd/linux/unlink.cpp b/libc/src/unistd/linux/unlink.cpp index 72d8e2398e3d7..5fde2600937b2 100644 --- a/libc/src/unistd/linux/unlink.cpp +++ b/libc/src/unistd/linux/unlink.cpp @@ -12,8 +12,8 @@ #include "src/__support/common.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. 
namespace LIBC_NAMESPACE_DECL { diff --git a/libc/src/unistd/linux/unlinkat.cpp b/libc/src/unistd/linux/unlinkat.cpp index 4ed20f542f170..b2012c52b8854 100644 --- a/libc/src/unistd/linux/unlinkat.cpp +++ b/libc/src/unistd/linux/unlinkat.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "hdr/fcntl_macros.h" #include // For syscall numbers. diff --git a/libc/src/unistd/linux/write.cpp b/libc/src/unistd/linux/write.cpp index 99d5ab7e480b0..eecb74429182a 100644 --- a/libc/src/unistd/linux/write.cpp +++ b/libc/src/unistd/linux/write.cpp @@ -10,8 +10,8 @@ #include "src/__support/OSUtil/syscall.h" // For internal syscall function. #include "src/__support/common.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include // For syscall numbers. diff --git a/libc/src/unistd/windows/getentropy.cpp b/libc/src/unistd/windows/getentropy.cpp index bfaec723ac63d..e25a7a8fed406 100644 --- a/libc/src/unistd/windows/getentropy.cpp +++ b/libc/src/unistd/windows/getentropy.cpp @@ -9,7 +9,7 @@ #include "src/unistd/getentropy.h" #include "hdr/errno_macros.h" #include "src/__support/common.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #define WIN32_LEAN_AND_MEAN #include diff --git a/libc/test/IntegrationTest/test.h b/libc/test/IntegrationTest/test.h index 5be66d9edff02..24c007d2e12e6 100644 --- a/libc/test/IntegrationTest/test.h +++ b/libc/test/IntegrationTest/test.h @@ -68,12 +68,9 @@ //////////////////////////////////////////////////////////////////////////////// // Errno checks. 
-#define ASSERT_ERRNO_EQ(VAL) \ - ASSERT_EQ(VAL, static_cast(LIBC_NAMESPACE::libc_errno)) -#define ASSERT_ERRNO_SUCCESS() \ - ASSERT_EQ(0, static_cast(LIBC_NAMESPACE::libc_errno)) -#define ASSERT_ERRNO_FAILURE() \ - ASSERT_NE(0, static_cast(LIBC_NAMESPACE::libc_errno)) +#define ASSERT_ERRNO_EQ(VAL) ASSERT_EQ(VAL, static_cast(libc_errno)) +#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast(libc_errno)) +#define ASSERT_ERRNO_FAILURE() ASSERT_NE(0, static_cast(libc_errno)) // Integration tests are compiled with -ffreestanding which stops treating // the main function as a non-overloadable special function. Hence, we use a diff --git a/libc/test/UnitTest/ErrnoCheckingTest.h b/libc/test/UnitTest/ErrnoCheckingTest.h index 3d3b72f80544f..4b7ff452f409c 100644 --- a/libc/test/UnitTest/ErrnoCheckingTest.h +++ b/libc/test/UnitTest/ErrnoCheckingTest.h @@ -9,8 +9,8 @@ #ifndef LLVM_LIBC_TEST_UNITTEST_ERRNOCHECKINGTEST_H #define LLVM_LIBC_TEST_UNITTEST_ERRNOCHECKINGTEST_H +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "test/UnitTest/Test.h" namespace LIBC_NAMESPACE_DECL { @@ -25,7 +25,7 @@ class ErrnoCheckingTest : public Test { public: void SetUp() override { Test::SetUp(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } void TearDown() override { diff --git a/libc/test/UnitTest/ErrnoSetterMatcher.h b/libc/test/UnitTest/ErrnoSetterMatcher.h index c6eadd25858ea..212b7a8f83e74 100644 --- a/libc/test/UnitTest/ErrnoSetterMatcher.h +++ b/libc/test/UnitTest/ErrnoSetterMatcher.h @@ -12,9 +12,9 @@ #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/fpbits_str.h" #include "src/__support/StringUtil/error_to_string.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/macros/properties/architectures.h" -#include "src/errno/libc_errno.h" #include "test/UnitTest/Test.h" namespace LIBC_NAMESPACE_DECL { @@ -114,8 +114,8 @@ template 
class ErrnoSetterMatcher : public Matcher { bool match(T got) { actual_return = got; - actual_errno = LIBC_NAMESPACE::libc_errno; - LIBC_NAMESPACE::libc_errno = 0; + actual_errno = libc_errno; + libc_errno = 0; if constexpr (ignore_errno()) return return_cmp.compare(actual_return); else diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index 21b8a45b0726f..da15cf2907f7c 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -279,8 +279,8 @@ struct ModifyMXCSR { #define EXPECT_MATH_ERRNO(expected) \ do { \ if (math_errhandling & MATH_ERRNO) { \ - int actual = LIBC_NAMESPACE::libc_errno; \ - LIBC_NAMESPACE::libc_errno = 0; \ + int actual = libc_errno; \ + libc_errno = 0; \ EXPECT_EQ(actual, expected); \ } \ } while (0) @@ -288,8 +288,8 @@ struct ModifyMXCSR { #define ASSERT_MATH_ERRNO(expected) \ do { \ if (math_errhandling & MATH_ERRNO) { \ - int actual = LIBC_NAMESPACE::libc_errno; \ - LIBC_NAMESPACE::libc_errno = 0; \ + int actual = libc_errno; \ + libc_errno = 0; \ ASSERT_EQ(actual, expected); \ } \ } while (0) diff --git a/libc/test/UnitTest/Test.h b/libc/test/UnitTest/Test.h index 95d48f40914ed..a5a2a3c7cf58e 100644 --- a/libc/test/UnitTest/Test.h +++ b/libc/test/UnitTest/Test.h @@ -42,15 +42,14 @@ #define ASSERT_ERRNO_EQ(VAL) \ do { \ - ASSERT_EQ(VAL, static_cast(LIBC_NAMESPACE::libc_errno)); \ - LIBC_NAMESPACE::libc_errno = 0; \ + ASSERT_EQ(VAL, static_cast(libc_errno)); \ + libc_errno = 0; \ } while (0) -#define ASSERT_ERRNO_SUCCESS() \ - ASSERT_EQ(0, static_cast(LIBC_NAMESPACE::libc_errno)) +#define ASSERT_ERRNO_SUCCESS() ASSERT_EQ(0, static_cast(libc_errno)) #define ASSERT_ERRNO_FAILURE() \ do { \ - ASSERT_NE(0, static_cast(LIBC_NAMESPACE::libc_errno)); \ - LIBC_NAMESPACE::libc_errno = 0; \ + ASSERT_NE(0, static_cast(libc_errno)); \ + libc_errno = 0; \ } while (0) #endif // LLVM_LIBC_TEST_UNITTEST_TEST_H diff --git a/libc/test/integration/src/pthread/pthread_create_test.cpp 
b/libc/test/integration/src/pthread/pthread_create_test.cpp index 29da4d5c3c8d7..aecbad6514aaa 100644 --- a/libc/test/integration/src/pthread/pthread_create_test.cpp +++ b/libc/test/integration/src/pthread/pthread_create_test.cpp @@ -29,7 +29,7 @@ #include "src/__support/CPP/new.h" #include "src/__support/threads/thread.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/IntegrationTest/test.h" @@ -332,7 +332,7 @@ static void run_failure_tests() { } TEST_MAIN() { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; run_success_tests(); run_failure_tests(); return 0; diff --git a/libc/test/integration/src/pthread/pthread_join_test.cpp b/libc/test/integration/src/pthread/pthread_join_test.cpp index 994fa57a6b337..5d0bcd8e23658 100644 --- a/libc/test/integration/src/pthread/pthread_join_test.cpp +++ b/libc/test/integration/src/pthread/pthread_join_test.cpp @@ -9,7 +9,7 @@ #include "src/pthread/pthread_create.h" #include "src/pthread/pthread_join.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/IntegrationTest/test.h" #include @@ -25,7 +25,7 @@ static void nullJoinTest() { } TEST_MAIN() { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; nullJoinTest(); return 0; } diff --git a/libc/test/integration/src/pthread/pthread_name_test.cpp b/libc/test/integration/src/pthread/pthread_name_test.cpp index 37ceceee880de..35dd3b165e0ee 100644 --- a/libc/test/integration/src/pthread/pthread_name_test.cpp +++ b/libc/test/integration/src/pthread/pthread_name_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/CPP/string_view.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/pthread/pthread_create.h" #include "src/pthread/pthread_getname_np.h" #include "src/pthread/pthread_join.h" diff --git a/libc/test/integration/src/unistd/getcwd_test.cpp 
b/libc/test/integration/src/unistd/getcwd_test.cpp index 551768187bf01..1b321b01e9315 100644 --- a/libc/test/integration/src/unistd/getcwd_test.cpp +++ b/libc/test/integration/src/unistd/getcwd_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/CPP/string_view.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/stdlib/getenv.h" #include "src/unistd/getcwd.h" @@ -31,12 +31,12 @@ TEST_MAIN(int argc, char **argv, char **envp) { cwd = LIBC_NAMESPACE::getcwd(buffer, 0); ASSERT_TRUE(cwd == nullptr); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Insufficient size cwd = LIBC_NAMESPACE::getcwd(buffer, 2); ASSERT_TRUE(cwd == nullptr); - int err = LIBC_NAMESPACE::libc_errno; + int err = libc_errno; ASSERT_EQ(err, ERANGE); return 0; diff --git a/libc/test/integration/startup/linux/tls_test.cpp b/libc/test/integration/startup/linux/tls_test.cpp index ef9fd9fcb7ff4..de3bd06c39cf6 100644 --- a/libc/test/integration/startup/linux/tls_test.cpp +++ b/libc/test/integration/startup/linux/tls_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sys/mman/mmap.h" #include "test/IntegrationTest/test.h" diff --git a/libc/test/src/__support/str_to_fp_test.h b/libc/test/src/__support/str_to_fp_test.h index d349192f107c0..9b4844d410db2 100644 --- a/libc/test/src/__support/str_to_fp_test.h +++ b/libc/test/src/__support/str_to_fp_test.h @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_float.h" #include "src/__support/uint128.h" diff --git a/libc/test/src/__support/str_to_integer_test.cpp 
b/libc/test/src/__support/str_to_integer_test.cpp index 1ec882b212b8a..40cb76a8bd6a2 100644 --- a/libc/test/src/__support/str_to_integer_test.cpp +++ b/libc/test/src/__support/str_to_integer_test.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "src/__support/libc_errno.h" #include "src/__support/str_to_integer.h" #include diff --git a/libc/test/src/dirent/dirent_test.cpp b/libc/test/src/dirent/dirent_test.cpp index 41f522a6a75fb..3f0095ca5ebe8 100644 --- a/libc/test/src/dirent/dirent_test.cpp +++ b/libc/test/src/dirent/dirent_test.cpp @@ -7,11 +7,11 @@ //===----------------------------------------------------------------------===// #include "src/__support/CPP/string_view.h" +#include "src/__support/libc_errno.h" #include "src/dirent/closedir.h" #include "src/dirent/dirfd.h" #include "src/dirent/opendir.h" #include "src/dirent/readdir.h" -#include "src/errno/libc_errno.h" #include "test/UnitTest/Test.h" @@ -55,17 +55,17 @@ TEST(LlvmLibcDirentTest, SimpleOpenAndRead) { } TEST(LlvmLibcDirentTest, OpenNonExistentDir) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ::DIR *dir = LIBC_NAMESPACE::opendir("___xyz123__.non_existent__"); ASSERT_TRUE(dir == nullptr); ASSERT_ERRNO_EQ(ENOENT); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } TEST(LlvmLibcDirentTest, OpenFile) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ::DIR *dir = LIBC_NAMESPACE::opendir("testdata/file1.txt"); ASSERT_TRUE(dir == nullptr); ASSERT_ERRNO_EQ(ENOTDIR); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } diff --git a/libc/test/src/errno/errno_test.cpp b/libc/test/src/errno/errno_test.cpp index b0db22a85f3bc..de82b0077f177 100644 --- a/libc/test/src/errno/errno_test.cpp +++ b/libc/test/src/errno/errno_test.cpp @@ -6,11 +6,11 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/UnitTest/Test.h" 
TEST(LlvmLibcErrnoTest, Basic) { int test_val = 123; - LIBC_NAMESPACE::libc_errno = test_val; + libc_errno = test_val; ASSERT_ERRNO_EQ(test_val); } diff --git a/libc/test/src/fcntl/creat_test.cpp b/libc/test/src/fcntl/creat_test.cpp index 4c9d2cbc33f47..d60c984934703 100644 --- a/libc/test/src/fcntl/creat_test.cpp +++ b/libc/test/src/fcntl/creat_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/creat.h" #include "src/fcntl/open.h" #include "src/unistd/close.h" diff --git a/libc/test/src/fcntl/fcntl_test.cpp b/libc/test/src/fcntl/fcntl_test.cpp index 1a21afe51085b..082c42481777b 100644 --- a/libc/test/src/fcntl/fcntl_test.cpp +++ b/libc/test/src/fcntl/fcntl_test.cpp @@ -9,7 +9,7 @@ #include "hdr/fcntl_macros.h" #include "hdr/stdio_macros.h" #include "hdr/types/struct_flock.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/fcntl.h" #include "src/fcntl/open.h" #include "src/unistd/close.h" @@ -166,7 +166,7 @@ TEST(LlvmLibcFcntlTest, UseAfterClose) { } TEST(LlvmLibcFcntlTest, SetGetOwnerTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; pid_t pid = LIBC_NAMESPACE::getpid(); ASSERT_GT(pid, -1); diff --git a/libc/test/src/fcntl/openat_test.cpp b/libc/test/src/fcntl/openat_test.cpp index 213b074799c8d..1997476f16a60 100644 --- a/libc/test/src/fcntl/openat_test.cpp +++ b/libc/test/src/fcntl/openat_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/fcntl/openat.h" #include "src/unistd/close.h" diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h index 77b465a3a0e63..6af9cfea0e0a5 100644 --- 
a/libc/test/src/math/RoundToIntegerTest.h +++ b/libc/test/src/math/RoundToIntegerTest.h @@ -55,7 +55,7 @@ class RoundToIntegerTestTemplate void test_one_input(RoundToIntegerFunc func, FloatType input, IntType expected, bool expectError) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); ASSERT_EQ(func(input), expected); diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp index 2e4c8eb2ab961..aa0128fee999b 100644 --- a/libc/test/src/math/acosf_test.cpp +++ b/libc/test/src/math/acosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr; using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acosf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/acoshf16_test.cpp b/libc/test/src/math/acoshf16_test.cpp index 7348018396bd7..2eb95215e4e8b 100644 --- a/libc/test/src/math/acoshf16_test.cpp +++ b/libc/test/src/math/acoshf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acoshf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp index 18ed5a11d50a7..3d3b827411a4a 100644 --- a/libc/test/src/math/acoshf_test.cpp +++ b/libc/test/src/math/acoshf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include 
"src/__support/libc_errno.h" #include "src/math/acoshf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcAcoshfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acoshf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/asin_test.cpp b/libc/test/src/math/asin_test.cpp index 385e341318aea..03ae963e9f924 100644 --- a/libc/test/src/math/asin_test.cpp +++ b/libc/test/src/math/asin_test.cpp @@ -38,7 +38,7 @@ TEST_F(LlvmLibcAsinTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::asin(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp index 5197810d8bd58..1eaa6b8a51359 100644 --- a/libc/test/src/math/asinf_test.cpp +++ b/libc/test/src/math/asinf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -22,7 +22,7 @@ using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcAsinfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp index ac125c3520c44..8c78f939cabf7 100644 --- a/libc/test/src/math/asinhf_test.cpp +++ b/libc/test/src/math/asinhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include 
"src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcAsinhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::asinhf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/atan2f_test.cpp b/libc/test/src/math/atan2f_test.cpp index 331f4281af839..50ab38208089a 100644 --- a/libc/test/src/math/atan2f_test.cpp +++ b/libc/test/src/math/atan2f_test.cpp @@ -81,7 +81,7 @@ TEST_F(LlvmLibcAtan2fTest, InFloatRange) { if (FPBits(w).is_nan() || FPBits(w).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::atan2f(x, y); ++total_count; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/atan_test.cpp b/libc/test/src/math/atan_test.cpp index 7f52578b9efed..7fa0dffd607e2 100644 --- a/libc/test/src/math/atan_test.cpp +++ b/libc/test/src/math/atan_test.cpp @@ -39,7 +39,7 @@ TEST_F(LlvmLibcAtanTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::atan(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp index 575ec89bd493c..a4bdf1867c39c 100644 --- a/libc/test/src/math/atanf_test.cpp +++ b/libc/test/src/math/atanf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -23,7 +23,7 @@ namespace 
mpfr = LIBC_NAMESPACE::testing::mpfr; // TODO: This test needs to have its checks for exceptions, errno // tightened TEST_F(LlvmLibcAtanfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanf(aNaN)); // TODO: Uncomment these checks later, RoundingMode affects running diff --git a/libc/test/src/math/atanhf_test.cpp b/libc/test/src/math/atanhf_test.cpp index 8b9db1dfdd976..32272ef482ab2 100644 --- a/libc/test/src/math/atanhf_test.cpp +++ b/libc/test/src/math/atanhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -25,7 +25,7 @@ namespace mpfr = LIBC_NAMESPACE::testing::mpfr; // tightened https://github.com/llvm/llvm-project/issues/88819. TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf(aNaN)); // TODO: Uncomment these checks later, RoundingMode affects running diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp index 2143c36f3d30b..90dc8ff6a0ea4 100644 --- a/libc/test/src/math/cosf_test.cpp +++ b/libc/test/src/math/cosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -23,7 +23,7 @@ using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcCosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cosf(aNaN)); 
EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp index 0d1c322b8e622..bdaba50f1f148 100644 --- a/libc/test/src/math/coshf_test.cpp +++ b/libc/test/src/math/coshf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/coshf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -22,7 +22,7 @@ using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::coshf(aNaN)); EXPECT_MATH_ERRNO(0); @@ -41,7 +41,7 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { } TEST_F(LlvmLibcCoshfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/cospif_test.cpp b/libc/test/src/math/cospif_test.cpp index 37ec2516f6a35..cb88bfcade0dc 100644 --- a/libc/test/src/math/cospif_test.cpp +++ b/libc/test/src/math/cospif_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cospif.h" #include "test/UnitTest/FPMatcher.h" #include "test/src/math/sdcomp26094.h" @@ -19,7 +19,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcCospifTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::cospif(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp index 6fb1d2d9d925e..6126e5f211fff 100644 --- 
a/libc/test/src/math/exp10_test.cpp +++ b/libc/test/src/math/exp10_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -105,7 +105,7 @@ TEST_F(LlvmLibcExp10Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::exp10(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp index 001b37809d930..89915961c9b90 100644 --- a/libc/test/src/math/exp10f_test.cpp +++ b/libc/test/src/math/exp10f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp10f(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp10fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -55,7 +55,7 @@ TEST_F(LlvmLibcExp10fTest, Overflow) { } TEST_F(LlvmLibcExp10fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( 0.0f, LIBC_NAMESPACE::exp10f(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW); @@ -97,7 +97,7 @@ TEST_F(LlvmLibcExp10fTest, 
TrickyInputs) { 0x41200000, // x = 10.0f }; for (int i = 0; i < N; ++i) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, LIBC_NAMESPACE::exp10f(x), 0.5); @@ -113,15 +113,14 @@ TEST_F(LlvmLibcExp10fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::exp10f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. - if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, LIBC_NAMESPACE::exp10f(x), 0.5); diff --git a/libc/test/src/math/exp10m1f_test.cpp b/libc/test/src/math/exp10m1f_test.cpp index aee273384f1a2..01802bd68f7e4 100644 --- a/libc/test/src/math/exp10m1f_test.cpp +++ b/libc/test/src/math/exp10m1f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -69,7 +69,7 @@ TEST_F(LlvmLibcExp10m1fTest, TrickyInputs) { }; for (float x : INPUTS) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x, LIBC_NAMESPACE::exp10m1f(x), 0.5); } @@ -82,14 +82,14 @@ TEST_F(LlvmLibcExp10m1fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_inf_or_nan()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::exp10m1f(x); // If the computation resulted in an 
error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. - if (FPBits(result).is_inf_or_nan() || LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_inf_or_nan() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x, LIBC_NAMESPACE::exp10m1f(x), 0.5); diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp index adfceceeef4b7..4cd95dd5486ed 100644 --- a/libc/test/src/math/exp2_test.cpp +++ b/libc/test/src/math/exp2_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -80,7 +80,7 @@ TEST_F(LlvmLibcExp2Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::exp2(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp index 0c4c821534392..aeecb3e74b07a 100644 --- a/libc/test/src/math/exp2f_test.cpp +++ b/libc/test/src/math/exp2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::exp2f(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ 
TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp2fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -71,7 +71,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) { 0xc3150000U, /*-0x1.2ap+7f*/ }; for (int i = 0; i < N; ++i) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, LIBC_NAMESPACE::exp2f(x), 0.5); @@ -80,7 +80,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) { } TEST_F(LlvmLibcExp2fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( 0.0f, LIBC_NAMESPACE::exp2f(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -108,15 +108,14 @@ TEST_F(LlvmLibcExp2fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::exp2f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. 
- if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, LIBC_NAMESPACE::exp2f(x), 0.5); diff --git a/libc/test/src/math/exp2m1f_test.cpp b/libc/test/src/math/exp2m1f_test.cpp index 793cf0cc2cbb4..0c87657abc085 100644 --- a/libc/test/src/math/exp2m1f_test.cpp +++ b/libc/test/src/math/exp2m1f_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -38,7 +38,7 @@ TEST_F(LlvmLibcExp2m1fTest, TrickyInputs) { }; for (float x : INPUTS) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x, LIBC_NAMESPACE::exp2m1f(x), 0.5); } @@ -51,15 +51,14 @@ TEST_F(LlvmLibcExp2m1fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::exp2m1f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. 
- if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2m1, x, LIBC_NAMESPACE::exp2m1f(x), 0.5); diff --git a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp index 0ab3a4e543464..83addaeb943d8 100644 --- a/libc/test/src/math/exp_test.cpp +++ b/libc/test/src/math/exp_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -78,7 +78,7 @@ TEST_F(LlvmLibcExpTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::exp(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp index 26a0bca4ce253..3c10812ff5bc2 100644 --- a/libc/test/src/math/expf_test.cpp +++ b/libc/test/src/math/expf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcExpfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::expf(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) { } TEST_F(LlvmLibcExpfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, 
LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -55,7 +55,7 @@ TEST_F(LlvmLibcExpfTest, Overflow) { } TEST_F(LlvmLibcExpfTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( 0.0f, LIBC_NAMESPACE::expf(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -76,7 +76,7 @@ TEST_F(LlvmLibcExpfTest, Underflow) { TEST_F(LlvmLibcExpfTest, Borderline) { float x; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; x = FPBits(0x42affff8U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); @@ -110,15 +110,14 @@ TEST_F(LlvmLibcExpfTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::expf(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. 
- if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp index 9720773d9f960..0cf07e2e49734 100644 --- a/libc/test/src/math/expm1_test.cpp +++ b/libc/test/src/math/expm1_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -64,7 +64,7 @@ TEST_F(LlvmLibcExpm1Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::expm1(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp index 274fe3bb7afb0..cf3fe9c26ae18 100644 --- a/libc/test/src/math/expm1f_test.cpp +++ b/libc/test/src/math/expm1f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::expm1f(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { } TEST_F(LlvmLibcExpm1fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, 
LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); @@ -55,7 +55,7 @@ TEST_F(LlvmLibcExpm1fTest, Overflow) { } TEST_F(LlvmLibcExpm1fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(-1.0f, LIBC_NAMESPACE::expm1f(FPBits(0xff7fffffU).get_val())); float x = FPBits(0xc2cffff8U).get_val(); @@ -70,7 +70,7 @@ TEST_F(LlvmLibcExpm1fTest, Underflow) { TEST_F(LlvmLibcExpm1fTest, Borderline) { float x; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; x = FPBits(0x42affff8U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); @@ -119,15 +119,14 @@ TEST_F(LlvmLibcExpm1fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::expm1f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. 
- if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp index 01aa1f82ae5d8..e9529d87c3885 100644 --- a/libc/test/src/math/log10_test.cpp +++ b/libc/test/src/math/log10_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -101,7 +101,7 @@ TEST_F(LlvmLibcLog10Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::log10(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp index 107e965a0d3ae..e5747b7e5ec0b 100644 --- a/libc/test/src/math/log1p_test.cpp +++ b/libc/test/src/math/log1p_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log1p.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -102,7 +102,7 @@ TEST_F(LlvmLibcLog1pTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::log1p(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp index bb181dc5e43b0..ffe2dd2c33dd6 100644 --- 
a/libc/test/src/math/log1pf_test.cpp +++ b/libc/test/src/math/log1pf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log1pf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -75,7 +75,7 @@ TEST_F(LlvmLibcLog1pfTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x, LIBC_NAMESPACE::log1pf(x), 0.5); } diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp index 8a07991a68886..fc440c09b42bd 100644 --- a/libc/test/src/math/log2_test.cpp +++ b/libc/test/src/math/log2_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -100,7 +100,7 @@ TEST_F(LlvmLibcLog2Test, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::log2(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp index 83691fb75300e..92226c763f458 100644 --- a/libc/test/src/math/log2f_test.cpp +++ b/libc/test/src/math/log2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -52,14 +52,13 @@ TEST_F(LlvmLibcLog2fTest, InFloatRange) { float x = FPBits(v).get_val(); if (FPBits(v).is_nan() || 
FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::log2f(x); // If the computation resulted in an error or did not produce valid result // in the single-precision floating point range, then ignore comparing with // MPFR result as MPFR can still produce valid results because of its // wider precision. - if (FPBits(result).is_nan() || FPBits(result).is_inf() || - LIBC_NAMESPACE::libc_errno != 0) + if (FPBits(result).is_nan() || FPBits(result).is_inf() || libc_errno != 0) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x, LIBC_NAMESPACE::log2f(x), 0.5); diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp index 969a469b2e1c6..54afaa33d1350 100644 --- a/libc/test/src/math/log_test.cpp +++ b/libc/test/src/math/log_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -99,7 +99,7 @@ TEST_F(LlvmLibcLogTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf() || x < 0.0) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::log(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/powf_test.cpp b/libc/test/src/math/powf_test.cpp index 448dcc0035e9b..4d189d813e584 100644 --- a/libc/test/src/math/powf_test.cpp +++ b/libc/test/src/math/powf_test.cpp @@ -78,7 +78,7 @@ TEST_F(LlvmLibcPowfTest, InFloatRange) { if (FPBits(w).is_nan() || FPBits(w).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float result = LIBC_NAMESPACE::powf(x, y); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp index d4c6bd416a409..4d5d9ddf464b1 100644 --- 
a/libc/test/src/math/sin_test.cpp +++ b/libc/test/src/math/sin_test.cpp @@ -71,7 +71,7 @@ TEST_F(LlvmLibcSinTest, InDoubleRange) { double x = FPBits(v).get_val(); if (FPBits(v).is_nan() || FPBits(v).is_inf()) continue; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; double result = LIBC_NAMESPACE::sin(x); ++cc; if (FPBits(result).is_nan() || FPBits(result).is_inf()) diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index 2823110331f30..ad2155f329cd9 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sincosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float sin, cos; LIBC_NAMESPACE::sincosf(aNaN, &sin, &cos); diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp index 8fd3ed1577cee..e0357e6157fdc 100644 --- a/libc/test/src/math/sinf_test.cpp +++ b/libc/test/src/math/sinf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcSinfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp index 6867c7aec57df..74f906ebaa983 100644 --- 
a/libc/test/src/math/sinhf_test.cpp +++ b/libc/test/src/math/sinhf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -22,7 +22,7 @@ using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcSinhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinhf(aNaN)); EXPECT_MATH_ERRNO(0); @@ -65,7 +65,7 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) { } TEST_F(LlvmLibcSinhfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/sinpif_test.cpp b/libc/test/src/math/sinpif_test.cpp index d00fd77d288c6..986c676761f0e 100644 --- a/libc/test/src/math/sinpif_test.cpp +++ b/libc/test/src/math/sinpif_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinpif.h" #include "test/UnitTest/FPMatcher.h" #include "test/src/math/sdcomp26094.h" @@ -21,7 +21,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcSinpifTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/FModTest.h b/libc/test/src/math/smoke/FModTest.h index 8fbcc2a276542..04cbc659ece5d 100644 --- a/libc/test/src/math/smoke/FModTest.h +++ b/libc/test/src/math/smoke/FModTest.h @@ -10,7 +10,7 @@ #define LLVM_LIBC_TEST_SRC_MATH_FMODTEST_H #include 
"src/__support/FPUtil/FEnvImpl.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/UnitTest/FEnvSafeTest.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h index 6ae97ce35a0d6..745ccbc748ecd 100644 --- a/libc/test/src/math/smoke/RoundToIntegerTest.h +++ b/libc/test/src/math/smoke/RoundToIntegerTest.h @@ -40,7 +40,7 @@ class RoundToIntegerTestTemplate void test_one_input(RoundToIntegerFunc func, F input, I expected, bool expectError) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::fputil::clear_except(FE_ALL_EXCEPT); ASSERT_EQ(func(input), expected); diff --git a/libc/test/src/math/smoke/acos_test.cpp b/libc/test/src/math/smoke/acos_test.cpp index 3a59bce264077..fe2caefb52ab8 100644 --- a/libc/test/src/math/smoke/acos_test.cpp +++ b/libc/test/src/math/smoke/acos_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "hdr/fenv_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acos.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ TEST_F(LlvmLibcAcosTest, SpecialNumbers) { EXPECT_FP_EQ(0x1.921fb54442d18p0, LIBC_NAMESPACE::acos(zero)); EXPECT_FP_EQ(0x1.921fb54442d18p0, LIBC_NAMESPACE::acos(neg_zero)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::acos(inf), FE_INVALID); EXPECT_MATH_ERRNO(EDOM); diff --git a/libc/test/src/math/smoke/acosf16_test.cpp b/libc/test/src/math/smoke/acosf16_test.cpp index c4274b8245092..7103dc33fec3a 100644 --- a/libc/test/src/math/smoke/acosf16_test.cpp +++ b/libc/test/src/math/smoke/acosf16_test.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" 
+#include "src/__support/libc_errno.h" #include "src/math/acosf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcAcosf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcosf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acosf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/acosf_test.cpp b/libc/test/src/math/smoke/acosf_test.cpp index 74f68e00011aa..257c6a3d1d22c 100644 --- a/libc/test/src/math/smoke/acosf_test.cpp +++ b/libc/test/src/math/smoke/acosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAcosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acosf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/acoshf16_test.cpp b/libc/test/src/math/smoke/acoshf16_test.cpp index 7681c2a4e7fbc..6b9c995cf9921 100644 --- a/libc/test/src/math/smoke/acoshf16_test.cpp +++ b/libc/test/src/math/smoke/acoshf16_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acoshf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcAcoshf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcoshf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acoshf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git 
a/libc/test/src/math/smoke/acoshf_test.cpp b/libc/test/src/math/smoke/acoshf_test.cpp index c5ba88055ac57..b6abfab999293 100644 --- a/libc/test/src/math/smoke/acoshf_test.cpp +++ b/libc/test/src/math/smoke/acoshf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acoshf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAcoshfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcoshfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::acoshf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/acospif16_test.cpp b/libc/test/src/math/smoke/acospif16_test.cpp index 66b94706eab94..4b2f6de3f7e37 100644 --- a/libc/test/src/math/smoke/acospif16_test.cpp +++ b/libc/test/src/math/smoke/acospif16_test.cpp @@ -6,14 +6,14 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/acospif16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" using LlvmLibcAcospif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAcospif16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::acospif16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinf16_test.cpp b/libc/test/src/math/smoke/asinf16_test.cpp index 9f675b08319c0..b03f0a420a499 100644 --- a/libc/test/src/math/smoke/asinf16_test.cpp +++ b/libc/test/src/math/smoke/asinf16_test.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinf16.h" #include "test/UnitTest/FPMatcher.h" 
#include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcAsinf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAsinf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinf_test.cpp b/libc/test/src/math/smoke/asinf_test.cpp index d817d2b366192..2615a8ddd16bd 100644 --- a/libc/test/src/math/smoke/asinf_test.cpp +++ b/libc/test/src/math/smoke/asinf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAsinfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAsinfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinhf16_test.cpp b/libc/test/src/math/smoke/asinhf16_test.cpp index dcaab217331c7..7f612ce3c4674 100644 --- a/libc/test/src/math/smoke/asinhf16_test.cpp +++ b/libc/test/src/math/smoke/asinhf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcAsinhf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAsinhf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::asinhf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/asinhf_test.cpp b/libc/test/src/math/smoke/asinhf_test.cpp index 4a8743c50075f..d812a2dffe8aa 100644 --- a/libc/test/src/math/smoke/asinhf_test.cpp +++ 
b/libc/test/src/math/smoke/asinhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/asinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAsinhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAsinhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::asinhf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/atan2f_test.cpp b/libc/test/src/math/smoke/atan2f_test.cpp index 1fbcfbe96b2d7..7f8cfb9830d2a 100644 --- a/libc/test/src/math/smoke/atan2f_test.cpp +++ b/libc/test/src/math/smoke/atan2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atan2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcAtan2fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtan2fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atan2f(sNaN, sNaN), FE_INVALID); diff --git a/libc/test/src/math/smoke/atanf16_test.cpp b/libc/test/src/math/smoke/atanf16_test.cpp index af50287d9b22a..ba1e3b2fc8bef 100644 --- a/libc/test/src/math/smoke/atanf16_test.cpp +++ b/libc/test/src/math/smoke/atanf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcAtanf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + 
libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::atanf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/atanf_test.cpp b/libc/test/src/math/smoke/atanf_test.cpp index 7d09a28beaa38..b56b9d0162b97 100644 --- a/libc/test/src/math/smoke/atanf_test.cpp +++ b/libc/test/src/math/smoke/atanf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcAtanfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/atanhf16_test.cpp b/libc/test/src/math/smoke/atanhf16_test.cpp index 81df6da8cee26..c2a520f7638fe 100644 --- a/libc/test/src/math/smoke/atanhf16_test.cpp +++ b/libc/test/src/math/smoke/atanhf16_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcAtanhf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanhf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::atanhf16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/atanhf_test.cpp b/libc/test/src/math/smoke/atanhf_test.cpp index 73a5b81b0240b..038cb30d89a4e 100644 --- a/libc/test/src/math/smoke/atanhf_test.cpp +++ b/libc/test/src/math/smoke/atanhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include 
"src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/atanhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -20,7 +20,7 @@ using LIBC_NAMESPACE::Sign; using LlvmLibcAtanhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcAtanhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::atanhf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); // TODO: Strengthen errno,exception checks and remove these assert macros diff --git a/libc/test/src/math/smoke/cosf16_test.cpp b/libc/test/src/math/smoke/cosf16_test.cpp index 2638551fb1d1b..4362a5a3a4bd1 100644 --- a/libc/test/src/math/smoke/cosf16_test.cpp +++ b/libc/test/src/math/smoke/cosf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cosf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcCosf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCosf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/cosf_test.cpp b/libc/test/src/math/smoke/cosf_test.cpp index 99773583dcb10..470a876c63a75 100644 --- a/libc/test/src/math/smoke/cosf_test.cpp +++ b/libc/test/src/math/smoke/cosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cosf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcCosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + 
libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cosf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/coshf16_test.cpp b/libc/test/src/math/smoke/coshf16_test.cpp index 08d05ecce86ba..7bf62afa24c43 100644 --- a/libc/test/src/math/smoke/coshf16_test.cpp +++ b/libc/test/src/math/smoke/coshf16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/coshf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcCoshf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::coshf16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) { } TEST_F(LlvmLibcCoshf16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::coshf16(max_normal), FE_OVERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp index 1611ea1b92926..ee8f0199df3b0 100644 --- a/libc/test/src/math/smoke/coshf_test.cpp +++ b/libc/test/src/math/smoke/coshf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/coshf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -19,7 +19,7 @@ using LlvmLibcCoshfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::coshf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -41,7 +41,7 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { } 
TEST_F(LlvmLibcCoshfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/cospif16_test.cpp b/libc/test/src/math/smoke/cospif16_test.cpp index edd8ed97b30f6..fcde0cc79e356 100644 --- a/libc/test/src/math/smoke/cospif16_test.cpp +++ b/libc/test/src/math/smoke/cospif16_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cospif16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcCospif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCospif16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/cospif_test.cpp b/libc/test/src/math/smoke/cospif_test.cpp index 20153897dc459..3d48909cca93e 100644 --- a/libc/test/src/math/smoke/cospif_test.cpp +++ b/libc/test/src/math/smoke/cospif_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/cospif.h" #include "test/UnitTest/FPMatcher.h" @@ -15,7 +15,7 @@ using LlvmLibcCospifTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcCospifTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::cospif(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/exp10_test.cpp b/libc/test/src/math/smoke/exp10_test.cpp index baf8a76810970..50d3de0c7fe75 100644 --- a/libc/test/src/math/smoke/exp10_test.cpp +++ 
b/libc/test/src/math/smoke/exp10_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/exp10f16_test.cpp b/libc/test/src/math/smoke/exp10f16_test.cpp index 1c4ef2aa08a70..bda40348f8832 100644 --- a/libc/test/src/math/smoke/exp10f16_test.cpp +++ b/libc/test/src/math/smoke/exp10f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcExp10f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExp10f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10f16(max_normal), FE_OVERFLOW); @@ -53,7 +53,7 @@ TEST_F(LlvmLibcExp10f16Test, Overflow) { } TEST_F(LlvmLibcExp10f16Test, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::exp10f16(neg_max_normal), FE_UNDERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp index bf39e2cc12d0c..fcd334bb9e364 100644 --- a/libc/test/src/math/smoke/exp10f_test.cpp +++ b/libc/test/src/math/smoke/exp10f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10f.h" #include 
"test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcExp10fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10f(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -44,7 +44,7 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp10fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/exp10m1f16_test.cpp b/libc/test/src/math/smoke/exp10m1f16_test.cpp index dfa7fa477d3d1..ed2d5a48b3165 100644 --- a/libc/test/src/math/smoke/exp10m1f16_test.cpp +++ b/libc/test/src/math/smoke/exp10m1f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10m1f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10m1f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExp10m1f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f16(max_normal), FE_OVERFLOW | FE_INEXACT); @@ -67,7 +67,7 @@ TEST_F(LlvmLibcExp10m1f16Test, Overflow) { } TEST_F(LlvmLibcExp10m1f16Test, ResultNearNegOne) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::exp10m1f16(neg_max_normal), diff --git a/libc/test/src/math/smoke/exp10m1f_test.cpp 
b/libc/test/src/math/smoke/exp10m1f_test.cpp index 2c2cfdbb08a3f..19369a897aaa9 100644 --- a/libc/test/src/math/smoke/exp10m1f_test.cpp +++ b/libc/test/src/math/smoke/exp10m1f_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp10m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcExp10m1fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp10m1fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10m1f(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -34,7 +34,7 @@ TEST_F(LlvmLibcExp10m1fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp10m1fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f(0x1.fffffep+127f), FE_OVERFLOW); @@ -50,7 +50,7 @@ TEST_F(LlvmLibcExp10m1fTest, Overflow) { } TEST_F(LlvmLibcExp10m1fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp10m1f(-max_normal), FE_UNDERFLOW); diff --git a/libc/test/src/math/smoke/exp2_test.cpp b/libc/test/src/math/smoke/exp2_test.cpp index 9ab9129416dad..aebf808350727 100644 --- a/libc/test/src/math/smoke/exp2_test.cpp +++ b/libc/test/src/math/smoke/exp2_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/exp2f16_test.cpp b/libc/test/src/math/smoke/exp2f16_test.cpp index f69b33a3cf37f..1eb7343dcd22f 100644 --- a/libc/test/src/math/smoke/exp2f16_test.cpp +++ b/libc/test/src/math/smoke/exp2f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" 
#include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcExp2f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp2f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExp2f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExp2f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2f16(max_normal), FE_OVERFLOW); @@ -53,7 +53,7 @@ TEST_F(LlvmLibcExp2f16Test, Overflow) { } TEST_F(LlvmLibcExp2f16Test, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::exp2f16(neg_max_normal), FE_UNDERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp index a928389cc41b4..c5243273d9ed4 100644 --- a/libc/test/src/math/smoke/exp2f_test.cpp +++ b/libc/test/src/math/smoke/exp2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcExp2fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2f(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -45,7 +45,7 @@ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp2fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); 
EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/exp2m1f16_test.cpp b/libc/test/src/math/smoke/exp2m1f16_test.cpp index f423196a70360..635b7a6e187d7 100644 --- a/libc/test/src/math/smoke/exp2m1f16_test.cpp +++ b/libc/test/src/math/smoke/exp2m1f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2m1f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcExp2m1f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExp2m1f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp2m1f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -39,7 +39,7 @@ TEST_F(LlvmLibcExp2m1f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExp2m1f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f16(max_normal), FE_OVERFLOW | FE_INEXACT); @@ -65,7 +65,7 @@ TEST_F(LlvmLibcExp2m1f16Test, Overflow) { } TEST_F(LlvmLibcExp2m1f16Test, ResultNearNegOne) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(-1.0, LIBC_NAMESPACE::exp2m1f16(neg_max_normal), FE_INEXACT); diff --git a/libc/test/src/math/smoke/exp2m1f_test.cpp b/libc/test/src/math/smoke/exp2m1f_test.cpp index 99bdf0035df0c..63852e11655ad 100644 --- a/libc/test/src/math/smoke/exp2m1f_test.cpp +++ b/libc/test/src/math/smoke/exp2m1f_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp2m1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LIBC_NAMESPACE::fputil::testing::ForceRoundingMode; using LIBC_NAMESPACE::fputil::testing::RoundingMode; TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) { 
- LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp2m1f(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -34,7 +34,7 @@ TEST_F(LlvmLibcExp2m1fTest, SpecialNumbers) { } TEST_F(LlvmLibcExp2m1fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp2m1f(0x1.fffffep+127), FE_OVERFLOW); @@ -50,7 +50,7 @@ TEST_F(LlvmLibcExp2m1fTest, Overflow) { } TEST_F(LlvmLibcExp2m1fTest, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(-1.0f, LIBC_NAMESPACE::exp2m1f(-0x1.fffffep+127), FE_UNDERFLOW); diff --git a/libc/test/src/math/smoke/exp_test.cpp b/libc/test/src/math/smoke/exp_test.cpp index f86243092f1fb..c3b2ae70e1d99 100644 --- a/libc/test/src/math/smoke/exp_test.cpp +++ b/libc/test/src/math/smoke/exp_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/exp.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/expf16_test.cpp b/libc/test/src/math/smoke/expf16_test.cpp index ab745a3cf6f56..863f694ffc41a 100644 --- a/libc/test/src/math/smoke/expf16_test.cpp +++ b/libc/test/src/math/smoke/expf16_test.cpp @@ -9,7 +9,7 @@ #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -17,7 +17,7 @@ using LlvmLibcExpf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expf16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -41,7 +41,7 @@ TEST_F(LlvmLibcExpf16Test, SpecialNumbers) { } TEST_F(LlvmLibcExpf16Test, 
Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expf16(max_normal), FE_OVERFLOW); @@ -54,7 +54,7 @@ TEST_F(LlvmLibcExpf16Test, Overflow) { } TEST_F(LlvmLibcExpf16Test, Underflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(zero, LIBC_NAMESPACE::expf16(neg_max_normal), FE_UNDERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp index eee8304999275..d34151735afa7 100644 --- a/libc/test/src/math/smoke/expf_test.cpp +++ b/libc/test/src/math/smoke/expf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcExpfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) { } TEST_F(LlvmLibcExpfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/expm1_test.cpp b/libc/test/src/math/smoke/expm1_test.cpp index bc71c53abc7ac..c842fe3c45fe1 100644 --- a/libc/test/src/math/smoke/expm1_test.cpp +++ b/libc/test/src/math/smoke/expm1_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/expm1f16_test.cpp 
b/libc/test/src/math/smoke/expm1f16_test.cpp index f297c5dfc3c7e..4d19a9bac5eb1 100644 --- a/libc/test/src/math/smoke/expm1f16_test.cpp +++ b/libc/test/src/math/smoke/expm1f16_test.cpp @@ -9,7 +9,7 @@ #include "hdr/errno_macros.h" #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -17,7 +17,7 @@ using LlvmLibcExpm1f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::expm1f16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1f16Test, SpecialNumbers) { } TEST_F(LlvmLibcExpm1f16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::expm1f16(max_normal), FE_OVERFLOW | FE_INEXACT); @@ -67,7 +67,7 @@ TEST_F(LlvmLibcExpm1f16Test, Overflow) { } TEST_F(LlvmLibcExpm1f16Test, ResultNearNegOne) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(-1.0), LIBC_NAMESPACE::expm1f16(neg_max_normal), diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp index dfb474d70fb6a..214bfe8abd4d2 100644 --- a/libc/test/src/math/smoke/expm1f_test.cpp +++ b/libc/test/src/math/smoke/expm1f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/expm1f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcExpm1fTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::expm1f(sNaN), 
FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { } TEST_F(LlvmLibcExpm1fTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/log10_test.cpp b/libc/test/src/math/smoke/log10_test.cpp index ff73850c52101..49cfda85111a5 100644 --- a/libc/test/src/math/smoke/log10_test.cpp +++ b/libc/test/src/math/smoke/log10_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log10.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log10f16_test.cpp b/libc/test/src/math/smoke/log10f16_test.cpp index 471e198933326..53f5ac46aa60f 100644 --- a/libc/test/src/math/smoke/log10f16_test.cpp +++ b/libc/test/src/math/smoke/log10f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log10f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcLog10f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog10f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log10f16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/log1p_test.cpp b/libc/test/src/math/smoke/log1p_test.cpp index 631c24b8abcf9..61c56cd2c6ddd 100644 --- a/libc/test/src/math/smoke/log1p_test.cpp +++ b/libc/test/src/math/smoke/log1p_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log1p.h" #include 
"test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log1pf_test.cpp b/libc/test/src/math/smoke/log1pf_test.cpp index bd828ad58c4c9..dc3489fddf99f 100644 --- a/libc/test/src/math/smoke/log1pf_test.cpp +++ b/libc/test/src/math/smoke/log1pf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log1pf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log2_test.cpp b/libc/test/src/math/smoke/log2_test.cpp index 9993d442967cb..0534d00b1f408 100644 --- a/libc/test/src/math/smoke/log2_test.cpp +++ b/libc/test/src/math/smoke/log2_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log2f16_test.cpp b/libc/test/src/math/smoke/log2f16_test.cpp index 6d98482aa4499..fd20652d2f008 100644 --- a/libc/test/src/math/smoke/log2f16_test.cpp +++ b/libc/test/src/math/smoke/log2f16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2f16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcLog2f16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLog2f16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::log2f16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/log2f_test.cpp b/libc/test/src/math/smoke/log2f_test.cpp index 8648b75b88b83..53d54ac367639 100644 --- a/libc/test/src/math/smoke/log2f_test.cpp +++ 
b/libc/test/src/math/smoke/log2f_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log2f.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/log_test.cpp b/libc/test/src/math/smoke/log_test.cpp index d31eb0c1db734..09e9ab0a9a4d8 100644 --- a/libc/test/src/math/smoke/log_test.cpp +++ b/libc/test/src/math/smoke/log_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/log.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/math/smoke/logf16_test.cpp b/libc/test/src/math/smoke/logf16_test.cpp index c7232aa1c1e32..2784f3d5fa54d 100644 --- a/libc/test/src/math/smoke/logf16_test.cpp +++ b/libc/test/src/math/smoke/logf16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/logf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcLogf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcLogf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::logf16(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sincosf_test.cpp b/libc/test/src/math/smoke/sincosf_test.cpp index 5f66868f12a1c..8ba0d04347bba 100644 --- a/libc/test/src/math/smoke/sincosf_test.cpp +++ b/libc/test/src/math/smoke/sincosf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sincosf.h" #include "test/UnitTest/FPMatcher.h" #include 
"test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcSinCosfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinCosfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; float sin, cos; LIBC_NAMESPACE::sincosf(sNaN, &sin, &cos); diff --git a/libc/test/src/math/smoke/sinf16_test.cpp b/libc/test/src/math/smoke/sinf16_test.cpp index a0e7a7ba321fd..6b168ac040db9 100644 --- a/libc/test/src/math/smoke/sinf16_test.cpp +++ b/libc/test/src/math/smoke/sinf16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcSinf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinf_test.cpp b/libc/test/src/math/smoke/sinf_test.cpp index de504b4f5335c..8173969fb2569 100644 --- a/libc/test/src/math/smoke/sinf_test.cpp +++ b/libc/test/src/math/smoke/sinf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcSinfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinhf16_test.cpp b/libc/test/src/math/smoke/sinhf16_test.cpp index 4f21d33ba78e0..d52739a9adb35 100644 --- a/libc/test/src/math/smoke/sinhf16_test.cpp +++ 
b/libc/test/src/math/smoke/sinhf16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcSinhf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sinhf16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -38,7 +38,7 @@ TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) { } TEST_F(LlvmLibcSinhf16Test, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::sinhf16(max_normal), FE_OVERFLOW | FE_INEXACT); diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp index e22cfc7ea14d8..ea6a4474a7806 100644 --- a/libc/test/src/math/smoke/sinhf_test.cpp +++ b/libc/test/src/math/smoke/sinhf_test.cpp @@ -9,7 +9,7 @@ #include "hdr/math_macros.h" #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -19,7 +19,7 @@ using LlvmLibcSinhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinhf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); @@ -52,7 +52,7 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) { } TEST_F(LlvmLibcSinhfTest, Overflow) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); diff --git a/libc/test/src/math/smoke/sinpif16_test.cpp b/libc/test/src/math/smoke/sinpif16_test.cpp index 
b2db6fb9f8626..9edf2cc663d4b 100644 --- a/libc/test/src/math/smoke/sinpif16_test.cpp +++ b/libc/test/src/math/smoke/sinpif16_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinpif16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using LlvmLibcSinpif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinpif16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/sinpif_test.cpp b/libc/test/src/math/smoke/sinpif_test.cpp index 1ba5c1d2b720a..b840f3980eda2 100644 --- a/libc/test/src/math/smoke/sinpif_test.cpp +++ b/libc/test/src/math/smoke/sinpif_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/sinpif.h" #include "test/UnitTest/FPMatcher.h" @@ -15,7 +15,7 @@ using LlvmLibcSinpifTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcSinpifTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinpif(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanf16_test.cpp b/libc/test/src/math/smoke/tanf16_test.cpp index f65b9fced72c4..95d200cf5591d 100644 --- a/libc/test/src/math/smoke/tanf16_test.cpp +++ b/libc/test/src/math/smoke/tanf16_test.cpp @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -15,7 +15,7 @@ using 
LlvmLibcTanf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanf_test.cpp b/libc/test/src/math/smoke/tanf_test.cpp index 178e9065f430f..12deca5cf9417 100644 --- a/libc/test/src/math/smoke/tanf_test.cpp +++ b/libc/test/src/math/smoke/tanf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcTanfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanhf16_test.cpp b/libc/test/src/math/smoke/tanhf16_test.cpp index fa6328e9ef0a6..eb90f02a8d7c3 100644 --- a/libc/test/src/math/smoke/tanhf16_test.cpp +++ b/libc/test/src/math/smoke/tanhf16_test.cpp @@ -8,7 +8,7 @@ #include "hdr/fenv_macros.h" #include "src/__support/FPUtil/cast.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanhf16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -16,7 +16,7 @@ using LlvmLibcTanhf16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanhf16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::tanhf16(aNaN)); EXPECT_MATH_ERRNO(0); @@ -40,7 +40,7 @@ TEST_F(LlvmLibcTanhf16Test, SpecialNumbers) { } TEST_F(LlvmLibcTanhf16Test, ResultNearBounds) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(1.0), 
LIBC_NAMESPACE::tanhf16(max_normal), FE_INEXACT); diff --git a/libc/test/src/math/smoke/tanhf_test.cpp b/libc/test/src/math/smoke/tanhf_test.cpp index c09761ef531f2..b12a331b31906 100644 --- a/libc/test/src/math/smoke/tanhf_test.cpp +++ b/libc/test/src/math/smoke/tanhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -18,7 +18,7 @@ using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanhf(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/smoke/tanpif16_test.cpp b/libc/test/src/math/smoke/tanpif16_test.cpp index 74797d1649b1a..ea896d7bb3e57 100644 --- a/libc/test/src/math/smoke/tanpif16_test.cpp +++ b/libc/test/src/math/smoke/tanpif16_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanpif16.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -14,7 +14,7 @@ using LlvmLibcTanpif16Test = LIBC_NAMESPACE::testing::FPTest; TEST_F(LlvmLibcTanpif16Test, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::tanpif16(sNaN), FE_INVALID); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp index 9061cf6fb30b8..ecc70194b6491 100644 --- a/libc/test/src/math/tanf_test.cpp +++ b/libc/test/src/math/tanf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanf.h" #include 
"test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -24,7 +24,7 @@ using LIBC_NAMESPACE::testing::SDCOMP26094_VALUES; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcTanfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp index 389abe4d85897..966ce649e2b38 100644 --- a/libc/test/src/math/tanhf_test.cpp +++ b/libc/test/src/math/tanhf_test.cpp @@ -8,7 +8,7 @@ #include "hdr/math_macros.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/math/tanhf.h" #include "test/UnitTest/FPMatcher.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ using LlvmLibcTanhfTest = LIBC_NAMESPACE::testing::FPTest; namespace mpfr = LIBC_NAMESPACE::testing::mpfr; TEST_F(LlvmLibcTanhfTest, SpecialNumbers) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::tanhf(aNaN)); EXPECT_MATH_ERRNO(0); diff --git a/libc/test/src/poll/poll_test.cpp b/libc/test/src/poll/poll_test.cpp index 30f5e41c61ecf..97b7b02718172 100644 --- a/libc/test/src/poll/poll_test.cpp +++ b/libc/test/src/poll/poll_test.cpp @@ -7,18 +7,18 @@ //===----------------------------------------------------------------------===// #include "hdr/limits_macros.h" // UINT_MAX -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/poll/poll.h" #include "test/UnitTest/Test.h" TEST(LlvmLibcPollTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int ret = LIBC_NAMESPACE::poll(nullptr, 0, 0); ASSERT_ERRNO_SUCCESS(); ASSERT_EQ(0, ret); } TEST(LlvmLibcPollTest, SmokeFailureTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int ret = LIBC_NAMESPACE::poll(nullptr, UINT_MAX, 0); ASSERT_ERRNO_EQ(EINVAL); ASSERT_EQ(-1, ret); diff --git a/libc/test/src/sched/affinity_test.cpp 
b/libc/test/src/sched/affinity_test.cpp index b5085203e5ce0..b77f22f8e60d2 100644 --- a/libc/test/src/sched/affinity_test.cpp +++ b/libc/test/src/sched/affinity_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/OSUtil/syscall.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_getaffinity.h" #include "src/sched/sched_setaffinity.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -17,7 +17,7 @@ TEST(LlvmLibcSchedAffinityTest, SmokeTest) { cpu_set_t mask; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; pid_t tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); ASSERT_GT(tid, pid_t(0)); @@ -32,15 +32,15 @@ TEST(LlvmLibcSchedAffinityTest, BadMask) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; pid_t tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT( LIBC_NAMESPACE::sched_getaffinity(tid, sizeof(cpu_set_t), nullptr), Fails(EFAULT)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT( LIBC_NAMESPACE::sched_setaffinity(tid, sizeof(cpu_set_t), nullptr), Fails(EFAULT)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } diff --git a/libc/test/src/sched/cpu_count_test.cpp b/libc/test/src/sched/cpu_count_test.cpp index 5250368a26162..919f1475e1d4d 100644 --- a/libc/test/src/sched/cpu_count_test.cpp +++ b/libc/test/src/sched/cpu_count_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/OSUtil/syscall.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_getaffinity.h" #include "src/sched/sched_getcpucount.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -17,7 +17,7 @@ TEST(LlvmLibcSchedCpuCountTest, SmokeTest) { cpu_set_t mask; - LIBC_NAMESPACE::libc_errno = 0; + 
libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; pid_t tid = LIBC_NAMESPACE::syscall_impl(SYS_gettid); ASSERT_GT(tid, pid_t(0)); diff --git a/libc/test/src/sched/get_priority_test.cpp b/libc/test/src/sched/get_priority_test.cpp index 59205c51e4a16..bb41dc0be2019 100644 --- a/libc/test/src/sched/get_priority_test.cpp +++ b/libc/test/src/sched/get_priority_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_get_priority_max.h" #include "src/sched/sched_get_priority_min.h" #include "test/UnitTest/Test.h" @@ -58,7 +58,7 @@ TEST(LlvmLibcSchedGetPriorityTest, HandleBadPolicyTest) { } TEST(LlvmLibcSchedGetPriorityTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // We Test: // SCHED_OTHER, SCHED_FIFO, SCHED_RR diff --git a/libc/test/src/sched/param_and_scheduler_test.cpp b/libc/test/src/sched/param_and_scheduler_test.cpp index 747c7e3409e41..4f2b6e412a4b7 100644 --- a/libc/test/src/sched/param_and_scheduler_test.cpp +++ b/libc/test/src/sched/param_and_scheduler_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_get_priority_max.h" #include "src/sched/sched_get_priority_min.h" #include "src/sched/sched_getparam.h" @@ -37,7 +37,7 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { public: void testSched(int policy, bool is_mandatory) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int init_policy = LIBC_NAMESPACE::sched_getscheduler(0); ASSERT_GE(init_policy, 0); @@ -55,30 +55,29 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { // Negative pid ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(-1, policy, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; 
ASSERT_EQ(LIBC_NAMESPACE::sched_getscheduler(-1), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Invalid Policy ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy | 128, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Out of bounds priority param.sched_priority = min_priority - 1; ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; param.sched_priority = max_priority + 1; ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, ¶m), -1); // A bit hard to test as depending on user privileges we can run into // different issues. - ASSERT_TRUE(LIBC_NAMESPACE::libc_errno == EINVAL || - LIBC_NAMESPACE::libc_errno == EPERM); - LIBC_NAMESPACE::libc_errno = 0; + ASSERT_TRUE(libc_errno == EINVAL || libc_errno == EPERM); + libc_errno = 0; param.sched_priority = min_priority; // Success/unsupported policy/missing permissions. @@ -87,10 +86,9 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { ASSERT_TRUE(setscheduler_result == 0 || setscheduler_result == -1); ASSERT_TRUE( setscheduler_result != -1 - ? (LIBC_NAMESPACE::libc_errno == 0) - : ((!is_mandatory && LIBC_NAMESPACE::libc_errno == EINVAL) || - LIBC_NAMESPACE::libc_errno == EPERM)); - LIBC_NAMESPACE::libc_errno = 0; + ? (libc_errno == 0) + : ((!is_mandatory && libc_errno == EINVAL) || libc_errno == EPERM)); + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getscheduler(0), setscheduler_result != -1 ? 
policy : init_policy); @@ -100,12 +98,12 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { param.sched_priority = -1; ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; param.sched_priority = max_priority + 1; ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; for (int priority = min_priority; priority <= max_priority; ++priority) { ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, ¶m), 0); @@ -117,21 +115,20 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { // Negative pid ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(-1, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(-1, ¶m), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Success/unsupported policy/missing permissions int setparam_result = LIBC_NAMESPACE::sched_setparam(0, ¶m); ASSERT_TRUE(setparam_result == 0 || setparam_result == -1); ASSERT_TRUE(setparam_result != -1 - ? (LIBC_NAMESPACE::libc_errno == 0) - : ((setscheduler_result == -1 && - LIBC_NAMESPACE::libc_errno == EINVAL) || - LIBC_NAMESPACE::libc_errno == EPERM)); - LIBC_NAMESPACE::libc_errno = 0; + ? 
(libc_errno == 0) + : ((setscheduler_result == -1 && libc_errno == EINVAL) || + libc_errno == EPERM)); + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, ¶m), 0); ASSERT_ERRNO_SUCCESS(); @@ -143,7 +140,7 @@ class SchedTest : public LIBC_NAMESPACE::testing::Test { // Null test ASSERT_EQ(LIBC_NAMESPACE::sched_setscheduler(0, policy, nullptr), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } }; @@ -161,13 +158,13 @@ LIST_SCHED_TESTS(SCHED_BATCH, true) LIST_SCHED_TESTS(SCHED_IDLE, true) TEST(LlvmLibcSchedParamAndSchedulerTest, NullParamTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_setparam(0, nullptr), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::sched_getparam(0, nullptr), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } diff --git a/libc/test/src/sched/sched_rr_get_interval_test.cpp b/libc/test/src/sched/sched_rr_get_interval_test.cpp index c22a2c76d743c..a0fe5edbe014e 100644 --- a/libc/test/src/sched/sched_rr_get_interval_test.cpp +++ b/libc/test/src/sched/sched_rr_get_interval_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_get_priority_min.h" #include "src/sched/sched_getscheduler.h" #include "src/sched/sched_rr_get_interval.h" @@ -17,7 +17,7 @@ #include TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; auto SetSched = [&](int policy) { int min_priority = LIBC_NAMESPACE::sched_get_priority_min(policy); ASSERT_GE(min_priority, 0); @@ -58,19 +58,19 @@ TEST(LlvmLibcSchedRRGetIntervalTest, SmokeTest) { // Null timespec ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, nullptr), -1); ASSERT_ERRNO_EQ(EFAULT); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // Negative 
pid ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(-1, &ts), -1); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; } // Negative tests don't have SCHED_RR set SetSched(SCHED_OTHER); ASSERT_EQ(LIBC_NAMESPACE::sched_rr_get_interval(0, &ts), 0); ASSERT_ERRNO_SUCCESS(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // TODO: Missing unkown pid -> ESRCH. This is read only so safe to try a few // unlikely values. diff --git a/libc/test/src/sched/yield_test.cpp b/libc/test/src/sched/yield_test.cpp index f1627a71fa9ad..4d13d50e25eb2 100644 --- a/libc/test/src/sched/yield_test.cpp +++ b/libc/test/src/sched/yield_test.cpp @@ -6,12 +6,12 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sched/sched_yield.h" #include "test/UnitTest/Test.h" TEST(LlvmLibcSchedYieldTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; // sched_yield() always succeeds, just do a basic test that errno/ret are // properly 0. ASSERT_EQ(LIBC_NAMESPACE::sched_yield(), 0); diff --git a/libc/test/src/signal/sigaltstack_test.cpp b/libc/test/src/signal/sigaltstack_test.cpp index cc392da8f4731..ce4dfddae2481 100644 --- a/libc/test/src/signal/sigaltstack_test.cpp +++ b/libc/test/src/signal/sigaltstack_test.cpp @@ -8,7 +8,7 @@ #include "hdr/signal_macros.h" #include "src/__support/OSUtil/syscall.h" // For internal syscall function. 
-#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/signal/linux/signal_utils.h" #include "src/signal/raise.h" #include "src/signal/sigaction.h" @@ -46,7 +46,7 @@ static void handler(int) { TEST(LlvmLibcSignalTest, SigaltstackRunOnAltStack) { struct sigaction action; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::sigaction(SIGUSR1, nullptr, &action), Succeeds(0)); action.sa_handler = handler; diff --git a/libc/test/src/signal/signal_test.cpp b/libc/test/src/signal/signal_test.cpp index bac9c3b8b68bb..62b86bf440291 100644 --- a/libc/test/src/signal/signal_test.cpp +++ b/libc/test/src/signal/signal_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/signal/raise.h" #include "src/signal/signal.h" @@ -17,7 +17,7 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; TEST(LlvmLibcSignal, Invalid) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; auto *valid = +[](int) {}; EXPECT_THAT((void *)LIBC_NAMESPACE::signal(0, valid), Fails(EINVAL, (void *)SIG_ERR)); diff --git a/libc/test/src/signal/sigprocmask_test.cpp b/libc/test/src/signal/sigprocmask_test.cpp index 12403f68b5930..891eac0f5bf75 100644 --- a/libc/test/src/signal/sigprocmask_test.cpp +++ b/libc/test/src/signal/sigprocmask_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/signal/raise.h" #include "src/signal/sigaddset.h" #include "src/signal/sigemptyset.h" @@ -33,7 +33,7 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; // This tests for invalid input. 
TEST_F(LlvmLibcSignalTest, SigprocmaskInvalid) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; sigset_t valid; // 17 and -4 are out of the range for sigprocmask's how paramater. diff --git a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp index c1edf56bdbd87..01ccb8218ee20 100644 --- a/libc/test/src/spawn/posix_spawn_file_actions_test.cpp +++ b/libc/test/src/spawn/posix_spawn_file_actions_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/spawn/file_actions.h" #include "src/spawn/posix_spawn_file_actions_addclose.h" #include "src/spawn/posix_spawn_file_actions_adddup2.h" diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index ef36cff2ffbd5..104fc478b100e 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -9,7 +9,7 @@ #include "src/stdio/fdopen.h" #include "hdr/fcntl_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/fclose.h" #include "src/stdio/fgets.h" @@ -22,7 +22,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); @@ -53,7 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { } TEST(LlvmLibcStdioFdopenTest, InvalidFd) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC); @@ -65,7 +65,7 
@@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) { } TEST(LlvmLibcStdioFdopenTest, InvalidMode) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU); @@ -83,7 +83,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) { auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w"); ASSERT_ERRNO_EQ(EINVAL); ASSERT_TRUE(nullptr == fp2); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::close(fd); ASSERT_ERRNO_SUCCESS(); } diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 2cc8436bd66f2..56bde5f0099a8 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -17,7 +17,7 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: @@ -33,7 +33,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index 46cf12c2c253b..90429ecf4e82b 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -20,7 +20,7 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: @@ -36,7 +36,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index a8a2c62f07b5e..abed3d4052939 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -14,7 +14,7 @@ #include "src/stdio/fwrite.h" #include "test/UnitTest/Test.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { constexpr char FILENAME[] = "testdata/fgets.test"; @@ -35,7 +35,7 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index a0368d701a676..e624181c795b8 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -21,7 +21,7 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE; @@ -41,7 +41,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); @@ -72,7 +72,7 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - 
LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); @@ -80,15 +80,15 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file), returns(EQ(EOF)).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file), returns(EQ(size_t(0))).with_errno(NE(0))); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); @@ -103,10 +103,10 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); // This is not a readable file. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), returns(EQ(0)).with_errno(NE(0))); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -121,15 +121,15 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { // Check that the other functions correctly set libc_errno. 
- // LIBC_NAMESPACE::libc_errno = 0; + // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0); // ASSERT_ERRNO_FAILURE(); - // LIBC_NAMESPACE::libc_errno = 0; + // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0); // ASSERT_ERRNO_FAILURE(); - // LIBC_NAMESPACE::libc_errno = 0; + // libc_errno = 0; // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"), // static_cast(nullptr)); // ASSERT_ERRNO_FAILURE(); @@ -165,7 +165,7 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct); constexpr char FILENAME[] = "testdata/fread_fwrite.test"; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file)); diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp index 61ce2a207fa19..03e1ac286b646 100644 --- a/libc/test/src/stdio/fopencookie_test.cpp +++ b/libc/test/src/stdio/fopencookie_test.cpp @@ -20,7 +20,7 @@ #include "hdr/stdio_macros.h" #include "hdr/types/size_t.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" using MemoryView = LIBC_NAMESPACE::testing::MemoryView; @@ -67,7 +67,7 @@ int seek_ss(void *cookie, off64_t *offset, int whence) { } else if (whence == SEEK_END) { new_offset = *offset + ss->endpos; } else { - LIBC_NAMESPACE::libc_errno = EINVAL; + libc_errno = EINVAL; return -1; } if (new_offset < 0 || size_t(new_offset) > ss->bufsize) @@ -115,7 +115,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -149,7 +149,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { 
LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_EQ(EBADF); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -178,7 +178,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp index 72875600903a6..84984e26398c0 100644 --- a/libc/test/src/stdio/remove_test.cpp +++ b/libc/test/src/stdio/remove_test.cpp @@ -14,13 +14,13 @@ #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { // The test strategy is to create a file and remove it, and also verify that // it was removed. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -39,7 +39,7 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) { // The test strategy is to create a dir and remove it, and also verify that // it was removed. 
- LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILENAME = "remove.test.dir"; diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp index a5dd734c63616..ac494a4ecaf8e 100644 --- a/libc/test/src/stdio/rename_test.cpp +++ b/libc/test/src/stdio/rename_test.cpp @@ -8,7 +8,7 @@ #include "include/llvm-libc-macros/linux/sys-stat-macros.h" #include "include/llvm-libc-macros/linux/unistd-macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/rename.h" #include "src/unistd/access.h" @@ -19,7 +19,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) { // The test strategy is to create a file and rename it, and also verify that // it was renamed. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index a1e1fee25db31..5872943c1bb41 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -14,7 +14,7 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { // The idea in this test is that we open a file for writing and reading, and @@ -102,6 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) { 0); ASSERT_ERRNO_EQ(EINVAL); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f)); } diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index f6af6ad3e364b..f1b545ba546f9 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -10,7 +10,7 @@ #include 
"src/stdio/sprintf.h" #include "src/__support/FPUtil/FPBits.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "test/UnitTest/RoundingModeUtils.h" #include "test/UnitTest/Test.h" #include @@ -3228,46 +3228,46 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) { char buff[1000]; int written; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%m"); ASSERT_STREQ_LEN(written, buff, "Success"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%m"); ASSERT_STREQ_LEN(written, buff, "Numerical result out of range"); // Check that it correctly consumes no arguments. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%m %d", 1); ASSERT_STREQ_LEN(written, buff, "Success 1"); // Width Tests - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%10m"); ASSERT_STREQ_LEN(written, buff, " Success"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%10m"); ASSERT_STREQ_LEN(written, buff, "Numerical result out of range"); // Precision Tests - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%.10m"); ASSERT_STREQ_LEN(written, buff, "Success"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%.10m"); ASSERT_STREQ_LEN(written, buff, "Numerical "); // Flag Tests (Only '-' since the others only affect ints) - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%-10m"); ASSERT_STREQ_LEN(written, buff, "Success "); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%-10m"); ASSERT_STREQ_LEN(written, buff, "Numerical result out of range"); @@ -3275,93 +3275,93 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) { // Since alt mode here is effectively a 
completely separate conversion, it // gets separate tests. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%#m"); ASSERT_STREQ_LEN(written, buff, "0"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#m"); ASSERT_STREQ_LEN(written, buff, "-9999"); // Alt Mode Width - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%#10m"); ASSERT_STREQ_LEN(written, buff, " 0"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#10m"); ASSERT_STREQ_LEN(written, buff, " ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#10m"); ASSERT_STREQ_LEN(written, buff, " -9999"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#3m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#3m"); ASSERT_STREQ_LEN(written, buff, "-9999"); // Alt Mode Precision - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#.10m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#.10m"); ASSERT_STREQ_LEN(written, buff, "-0000009999"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#.3m"); ASSERT_STREQ_LEN(written, buff, "ERA"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#.3m"); ASSERT_STREQ_LEN(written, buff, "-9999"); // We don't test precision (or int flags) on errno = 0 because it behaves // weirdly, 
see the docs for more information. - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%#.1m"); ASSERT_STREQ_LEN(written, buff, "0"); // Alt Mode Flags // '-' flag - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; written = LIBC_NAMESPACE::sprintf(buff, "%#-10m"); ASSERT_STREQ_LEN(written, buff, "0 "); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#-10m"); ASSERT_STREQ_LEN(written, buff, "ERANGE "); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#-10m"); ASSERT_STREQ_LEN(written, buff, "-9999 "); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#-3m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#-3m"); ASSERT_STREQ_LEN(written, buff, "-9999"); // '+' flag - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#+m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#+m"); ASSERT_STREQ_LEN(written, buff, "-9999"); @@ -3370,38 +3370,38 @@ TEST(LlvmLibcSPrintfTest, StrerrorConv) { // come up, but I've avoided it for the other %m tests for ease of // refactoring if necessary. Here it needs to be positive to test that the // flags that only affect positive signed integers are properly passed along. 
- LIBC_NAMESPACE::libc_errno = 9999; + libc_errno = 9999; written = LIBC_NAMESPACE::sprintf(buff, "%#+m"); ASSERT_STREQ_LEN(written, buff, "+9999"); // ' ' flag - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%# m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%# m"); ASSERT_STREQ_LEN(written, buff, "-9999"); - LIBC_NAMESPACE::libc_errno = 9999; + libc_errno = 9999; written = LIBC_NAMESPACE::sprintf(buff, "%# m"); ASSERT_STREQ_LEN(written, buff, " 9999"); // '0' flag - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#010m"); ASSERT_STREQ_LEN(written, buff, " ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#010m"); ASSERT_STREQ_LEN(written, buff, "-000009999"); - LIBC_NAMESPACE::libc_errno = ERANGE; + libc_errno = ERANGE; written = LIBC_NAMESPACE::sprintf(buff, "%#03m"); ASSERT_STREQ_LEN(written, buff, "ERANGE"); - LIBC_NAMESPACE::libc_errno = -9999; + libc_errno = -9999; written = LIBC_NAMESPACE::sprintf(buff, "%#03m"); ASSERT_STREQ_LEN(written, buff, "-9999"); } diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp index 67f1b0ff513bc..5d482b70064bd 100644 --- a/libc/test/src/stdio/unlocked_fileop_test.cpp +++ b/libc/test/src/stdio/unlocked_fileop_test.cpp @@ -17,7 +17,7 @@ #include "src/stdio/fwrite_unlocked.h" #include "test/UnitTest/Test.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { constexpr char fNAME[] = "testdata/unlocked_read_and_write.test"; @@ -36,7 +36,7 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - 
LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); @@ -57,7 +57,7 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 03f0a6539c785..3eeccc5727e77 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -9,6 +9,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" #include "src/__support/ctype_utils.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/strtoint32_test.cpp b/libc/test/src/stdlib/strtoint32_test.cpp index 17df432fc8e68..e6da692714d28 100644 --- a/libc/test/src/stdlib/strtoint32_test.cpp +++ b/libc/test/src/stdlib/strtoint32_test.cpp @@ -8,9 +8,9 @@ #include +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" #include "StrtolTest.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ int32_t strtoint32(const char *__restrict str, char **__restrict str_end, int base) { auto result = internal::strtointeger(str, base); if (result.has_error()) - LIBC_NAMESPACE::libc_errno = result.error; + libc_errno = result.error; if (str_end != nullptr) *str_end = const_cast(str + result.parsed_len); @@ -33,7 +33,7 @@ uint32_t strtouint32(const char *__restrict str, char **__restrict str_end, int base) { auto result = internal::strtointeger(str, base); if (result.has_error()) - 
LIBC_NAMESPACE::libc_errno = result.error; + libc_errno = result.error; if (str_end != nullptr) *str_end = const_cast(str + result.parsed_len); diff --git a/libc/test/src/stdlib/strtoint64_test.cpp b/libc/test/src/stdlib/strtoint64_test.cpp index b5fe69dfaa701..2c5d948f5fae2 100644 --- a/libc/test/src/stdlib/strtoint64_test.cpp +++ b/libc/test/src/stdlib/strtoint64_test.cpp @@ -8,9 +8,9 @@ #include +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" #include "src/__support/str_to_integer.h" -#include "src/errno/libc_errno.h" #include "StrtolTest.h" #include "test/UnitTest/Test.h" @@ -21,7 +21,7 @@ int64_t strtoint64(const char *__restrict str, char **__restrict str_end, int base) { auto result = internal::strtointeger(str, base); if (result.has_error()) - LIBC_NAMESPACE::libc_errno = result.error; + libc_errno = result.error; if (str_end != nullptr) *str_end = const_cast(str + result.parsed_len); @@ -33,7 +33,7 @@ uint64_t strtouint64(const char *__restrict str, char **__restrict str_end, int base) { auto result = internal::strtointeger(str, base); if (result.has_error()) - LIBC_NAMESPACE::libc_errno = result.error; + libc_errno = result.error; if (str_end != nullptr) *str_end = const_cast(str + result.parsed_len); diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index eb4056dc7ba64..c2f2b9c9a11c3 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/libc_errno.h" #include "src/__support/uint128.h" #include "src/stdlib/strtold.h" diff --git a/libc/test/src/sys/mman/linux/mlock_test.cpp b/libc/test/src/sys/mman/linux/mlock_test.cpp index 88abacad554e0..6b81411ca604a 100644 --- a/libc/test/src/sys/mman/linux/mlock_test.cpp +++ b/libc/test/src/sys/mman/linux/mlock_test.cpp @@ -7,7 +7,7 @@ 
//===----------------------------------------------------------------------===// #include "src/__support/OSUtil/syscall.h" // For internal syscall function. -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/sys/mman/madvise.h" #include "src/sys/mman/mincore.h" #include "src/sys/mman/mlock.h" @@ -149,9 +149,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) { Succeeds()); auto retval = LIBC_NAMESPACE::mlockall(MCL_CURRENT); if (retval == -1) { - EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM || - LIBC_NAMESPACE::libc_errno == EPERM); - LIBC_NAMESPACE::libc_errno = 0; + EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM); + libc_errno = 0; return; } unsigned char vec; @@ -163,9 +162,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) { { auto retval = LIBC_NAMESPACE::mlockall(MCL_FUTURE); if (retval == -1) { - EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM || - LIBC_NAMESPACE::libc_errno == EPERM); - LIBC_NAMESPACE::libc_errno = 0; + EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM); + libc_errno = 0; return; } PageHolder holder; @@ -180,9 +178,8 @@ TEST_F(LlvmLibcMlockTest, MLockAll) { { auto retval = LIBC_NAMESPACE::mlockall(MCL_FUTURE | MCL_ONFAULT); if (retval == -1) { - EXPECT_TRUE(LIBC_NAMESPACE::libc_errno == ENOMEM || - LIBC_NAMESPACE::libc_errno == EPERM); - LIBC_NAMESPACE::libc_errno = 0; + EXPECT_TRUE(libc_errno == ENOMEM || libc_errno == EPERM); + libc_errno = 0; return; } PageHolder holder; diff --git a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp index 455a82678e18f..ba0ee4f09109e 100644 --- a/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp +++ b/libc/test/src/sys/statvfs/linux/fstatvfs_test.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/fcntl/open.h" 
#include "src/sys/stat/mkdirat.h" #include "src/sys/statvfs/fstatvfs.h" @@ -41,7 +41,7 @@ TEST_F(LlvmLibcSysFStatvfsTest, FStatvfsInvalidPath) { // Always delete the folder so that we start in a consistent state. LIBC_NAMESPACE::rmdir(TEST_DIR); - LIBC_NAMESPACE::libc_errno = 0; // Reset errno + libc_errno = 0; // Reset errno ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU), Succeeds(0)); diff --git a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp index f356bb3d277b6..327dec07a1b79 100644 --- a/libc/test/src/sys/statvfs/linux/statvfs_test.cpp +++ b/libc/test/src/sys/statvfs/linux/statvfs_test.cpp @@ -7,8 +7,8 @@ //===----------------------------------------------------------------------===// #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/config.h" -#include "src/errno/libc_errno.h" #include "src/sys/stat/mkdirat.h" #include "src/sys/statvfs/statvfs.h" #include "src/unistd/rmdir.h" @@ -37,7 +37,7 @@ TEST_F(LlvmLibcSysStatvfsTest, StatvfsInvalidPath) { // Always delete the folder so that we start in a consistent state. 
LIBC_NAMESPACE::rmdir(TEST_DIR); - LIBC_NAMESPACE::libc_errno = 0; // Reset errno + libc_errno = 0; // Reset errno ASSERT_THAT(LIBC_NAMESPACE::mkdirat(AT_FDCWD, TEST_DIR, S_IRWXU), Succeeds(0)); diff --git a/libc/test/src/sys/time/setitimer_test.cpp b/libc/test/src/sys/time/setitimer_test.cpp index 16d33fdf1e4f9..115f9e662ed46 100644 --- a/libc/test/src/sys/time/setitimer_test.cpp +++ b/libc/test/src/sys/time/setitimer_test.cpp @@ -24,7 +24,7 @@ static bool timer_fired(false); extern "C" void handle_sigalrm(int) { timer_fired = true; } TEST_F(LlvmLibcSysTimeSetitimerTest, SmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; struct sigaction sa; sa.sa_handler = handle_sigalrm; LIBC_NAMESPACE::sigemptyset(&sa.sa_mask); diff --git a/libc/test/src/termios/termios_test.cpp b/libc/test/src/termios/termios_test.cpp index f8fc09a8bbf0e..5ec169a886b1e 100644 --- a/libc/test/src/termios/termios_test.cpp +++ b/libc/test/src/termios/termios_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/termios/cfgetispeed.h" #include "src/termios/cfgetospeed.h" @@ -30,21 +30,21 @@ using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; TEST(LlvmLibcTermiosTest, SpeedSmokeTest) { struct termios t; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, B50), Succeeds(0)); ASSERT_EQ(LIBC_NAMESPACE::cfgetispeed(&t), speed_t(B50)); ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, B75), Succeeds(0)); ASSERT_EQ(LIBC_NAMESPACE::cfgetospeed(&t), speed_t(B75)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::cfsetispeed(&t, ~CBAUD), Fails(EINVAL)); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::cfsetospeed(&t, ~CBAUD), Fails(EINVAL)); } TEST(LlvmLibcTermiosTest, GetAttrSmokeTest) { struct termios t; - 
LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY); if (fd < 0) return; // When /dev/tty is not available, no point continuing. @@ -54,7 +54,7 @@ TEST(LlvmLibcTermiosTest, GetAttrSmokeTest) { } TEST(LlvmLibcTermiosTest, TcGetSidSmokeTest) { - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; int fd = LIBC_NAMESPACE::open("/dev/tty", O_RDONLY); if (fd < 0) return; // When /dev/tty is not available, no point continuing. diff --git a/libc/test/src/time/asctime_r_test.cpp b/libc/test/src/time/asctime_r_test.cpp index b595cfe024866..d840248b7df42 100644 --- a/libc/test/src/time/asctime_r_test.cpp +++ b/libc/test/src/time/asctime_r_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/asctime_r.h" #include "src/time/time_constants.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/time/asctime_test.cpp b/libc/test/src/time/asctime_test.cpp index 169a7463a3037..cad25fffc65af 100644 --- a/libc/test/src/time/asctime_test.cpp +++ b/libc/test/src/time/asctime_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/asctime.h" #include "test/UnitTest/Test.h" #include "test/src/time/TmHelper.h" diff --git a/libc/test/src/time/ctime_r_test.cpp b/libc/test/src/time/ctime_r_test.cpp index 27011b7e0fbd6..fe43877aa499d 100644 --- a/libc/test/src/time/ctime_r_test.cpp +++ b/libc/test/src/time/ctime_r_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/ctime_r.h" #include "src/time/time_constants.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/time/ctime_test.cpp 
b/libc/test/src/time/ctime_test.cpp index 6f1168f0b6685..5ff69f6619b4f 100644 --- a/libc/test/src/time/ctime_test.cpp +++ b/libc/test/src/time/ctime_test.cpp @@ -6,7 +6,7 @@ // //===----------------------------------------------------------------------===// -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/ctime.h" #include "test/UnitTest/Test.h" #include "test/src/time/TmHelper.h" diff --git a/libc/test/src/time/gmtime_test.cpp b/libc/test/src/time/gmtime_test.cpp index 6af5a18d36996..41236665d2eaa 100644 --- a/libc/test/src/time/gmtime_test.cpp +++ b/libc/test/src/time/gmtime_test.cpp @@ -8,7 +8,7 @@ #include "hdr/types/struct_tm.h" #include "src/__support/CPP/limits.h" // INT_MAX, INT_MIN -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/gmtime.h" #include "src/time/time_constants.h" #include "test/UnitTest/ErrnoSetterMatcher.h" @@ -30,7 +30,7 @@ TEST(LlvmLibcGmTime, OutOfRange) { EXPECT_TRUE(tm_data == nullptr); ASSERT_ERRNO_EQ(EOVERFLOW); - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; seconds = INT_MIN * static_cast( diff --git a/libc/test/src/time/nanosleep_test.cpp b/libc/test/src/time/nanosleep_test.cpp index d4f98e29bd980..e0200ff3aaa26 100644 --- a/libc/test/src/time/nanosleep_test.cpp +++ b/libc/test/src/time/nanosleep_test.cpp @@ -7,7 +7,7 @@ //===----------------------------------------------------------------------===// #include "hdr/types/struct_timespec.h" -#include "src/errno/libc_errno.h" +#include "src/__support/libc_errno.h" #include "src/time/nanosleep.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" @@ -17,7 +17,7 @@ namespace cpp = LIBC_NAMESPACE::cpp; TEST(LlvmLibcNanosleep, SmokeTest) { // TODO: When we have the code to read clocks, test that time has passed. 
using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - LIBC_NAMESPACE::libc_errno = 0; + libc_errno = 0; struct timespec tim = {1, 500}; struct timespec tim2 = {0, 0}; diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index b86d2f27e516a..123d9ccc8310f 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1601,6 +1601,7 @@ libc_support_library( libc_header_library( name = "libcxx_shared_headers", hdrs = [ + "shared/libc_common.h", "shared/fp_bits.h", "shared/str_to_float.h", "shared/str_to_integer.h", @@ -1618,7 +1619,7 @@ libc_header_library( libc_support_library( name = "errno", srcs = ["src/errno/libc_errno.cpp"], - hdrs = ["src/errno/libc_errno.h"], + hdrs = ["src/__support/libc_errno.h"], deps = [ ":__support_common", ":__support_cpp_atomic", From 79108da325daec08f5b50169a9c35e03ea0645a3 Mon Sep 17 00:00:00 2001 From: sribee8 <145801438+sribee8@users.noreply.github.com> Date: Wed, 11 Jun 2025 20:28:55 +0000 Subject: [PATCH 149/851] [libc][obvious] Changed incorrect type (#143780) After changing mbstate_t to mbstate we forgot to change the character_converter files to reflect it. 
Co-authored-by: Sriya Pratipati --- libc/src/__support/wchar/character_converter.cpp | 2 +- libc/src/__support/wchar/character_converter.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp index 0afc2a6f59e64..3cdb8ca83b7f0 100644 --- a/libc/src/__support/wchar/character_converter.cpp +++ b/libc/src/__support/wchar/character_converter.cpp @@ -16,7 +16,7 @@ namespace LIBC_NAMESPACE_DECL { namespace internal { -CharacterConverter::CharacterConverter(mbstate_t *mbstate) { state = mbstate; } +CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; } bool CharacterConverter::isComplete() {} diff --git a/libc/src/__support/wchar/character_converter.h b/libc/src/__support/wchar/character_converter.h index a6bac43805376..d0602d2defe22 100644 --- a/libc/src/__support/wchar/character_converter.h +++ b/libc/src/__support/wchar/character_converter.h @@ -19,10 +19,10 @@ namespace internal { class CharacterConverter { private: - mbstate_t *state; + mbstate *state; public: - CharacterConverter(mbstate_t *mbstate); + CharacterConverter(mbstate *mbstate); bool isComplete(); From c0c0f60ca14422dfbfe27fddd8d47faa596165d8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 11 Jun 2025 22:09:55 +0100 Subject: [PATCH 150/851] [GlobalOpt] Bail out on non-ConstExprs in isSimpleEnoughtToCommit. 
(#143400) Bail out for non ConstantExpr constants in isSimpleEnoughValueToCommitHelper to prevent crash for non-ConstantExpr constants PR: https://github.com/llvm/llvm-project/pull/143400 --- llvm/lib/Transforms/Utils/Evaluator.cpp | 4 +- .../global-constructor-complex-constants.ll | 64 +++++++++++++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp index 2af447aadce22..d1db2ee29f3a2 100644 --- a/llvm/lib/Transforms/Utils/Evaluator.cpp +++ b/llvm/lib/Transforms/Utils/Evaluator.cpp @@ -77,7 +77,9 @@ isSimpleEnoughValueToCommitHelper(Constant *C, // We don't know exactly what relocations are allowed in constant expressions, // so we allow &global+constantoffset, which is safe and uniformly supported // across targets. - ConstantExpr *CE = cast(C); + ConstantExpr *CE = dyn_cast(C); + if (!CE) + return false; switch (CE->getOpcode()) { case Instruction::BitCast: // Bitcast is fine if the casted value is fine. diff --git a/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll b/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll new file mode 100644 index 0000000000000..6d9bdc41a0041 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/global-constructor-complex-constants.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -p globalopt -S %s | FileCheck %s + +@llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @ctor, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_nocfi, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_dso_local_equivalent, ptr null }] + +@foo = internal global ptr null + +declare void @user(ptr) + +;. 
+; CHECK: @llvm.global_ctors = appending global [3 x { i32, ptr, ptr }] [{ i32, ptr, ptr } { i32 65535, ptr @ctor, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_nocfi, ptr null }, { i32, ptr, ptr } { i32 65535, ptr @ctor_dso_local_equivalent, ptr null }] +; CHECK: @foo = internal global ptr null +;. +define void @ctor() { +; CHECK-LABEL: define void @ctor() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr ptrauth (ptr @foo, i32 0), ptr [[DST]], align 8 +; CHECK-NEXT: call void @user(ptr [[DST]]) +; CHECK-NEXT: ret void +; +entry: + %dst = alloca ptr, align 8 + store ptr ptrauth (ptr @foo, i32 0), ptr %dst, align 8 + call void @user(ptr %dst) + ret void +} + +define void @ctor_nocfi() { +; CHECK-LABEL: define void @ctor_nocfi() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr no_cfi @foo, ptr [[DST]], align 8 +; CHECK-NEXT: call void @user(ptr [[DST]]) +; CHECK-NEXT: ret void +; +entry: + %dst = alloca ptr, align 8 + store ptr no_cfi @foo, ptr %dst, align 8 + call void @user(ptr %dst) + ret void +} + +define void @fn() { +; CHECK-LABEL: define void @fn() { +; CHECK-NEXT: ret void +; + ret void +} + +define void @ctor_dso_local_equivalent() { +; CHECK-LABEL: define void @ctor_dso_local_equivalent() { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[DST:%.*]] = alloca ptr, align 8 +; CHECK-NEXT: store ptr dso_local_equivalent @fn, ptr [[DST]], align 8 +; CHECK-NEXT: call void @user(ptr [[DST]]) +; CHECK-NEXT: ret void +; +entry: + %dst = alloca ptr, align 8 + store ptr dso_local_equivalent @fn, ptr %dst, align 8 + call void @user(ptr %dst) + ret void +} From f39f53e569f92987683626d910e9dbcbd59ff410 Mon Sep 17 00:00:00 2001 From: Shafik Yaghmour Date: Wed, 11 Jun 2025 14:11:19 -0700 Subject: [PATCH 151/851] [Clang][NFC] Move HeadingAndSpellings to avoid copying (#143611) Static analysis flagged that we could move HeadingAndSpellings and avoid a copy of a 
large object. --- clang/utils/TableGen/ClangAttrEmitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 21d76c12a3cce..42627f02cf356 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -5405,7 +5405,7 @@ void EmitClangAttrDocs(const RecordKeeper &Records, raw_ostream &OS) { // Handle Undocumented category separately - no content merging if (Cat == "Undocumented" && UndocumentedCategory) { UndocumentedDocs.push_back( - DocumentationData(Doc, Attr, HeadingAndSpellings)); + DocumentationData(Doc, Attr, std::move(HeadingAndSpellings))); continue; } From d7e7f22626f214766f3592341dd1737fd232c6a5 Mon Sep 17 00:00:00 2001 From: "Oleksandr T." Date: Thu, 12 Jun 2025 00:19:25 +0300 Subject: [PATCH 152/851] [Clang] fix missing source location for errors in macro-expanded (#143460) Fixes #143216 --- This patch fixes diagnostic locations for tokens from macro expansions. --- clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Parse/Parser.h | 4 +--- clang/lib/Parse/ParseExprCXX.cpp | 4 ++-- clang/lib/Parse/ParseStmt.cpp | 7 ++++-- clang/lib/Parse/Parser.cpp | 5 +++++ .../test/Parser/macro-expansion-recovery.cpp | 22 +++++++++++++++++++ clang/test/Parser/switch-recovery.cpp | 13 +++++++++++ 7 files changed, 49 insertions(+), 7 deletions(-) create mode 100644 clang/test/Parser/macro-expansion-recovery.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8043ab48f0b4f..b42d5f8425af6 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -694,6 +694,7 @@ Bug Fixes in This Version - Constant evaluation now correctly runs the destructor of a variable declared in the second clause of a C-style ``for`` loop. (#GH139818) - Fixed a bug with constexpr evaluation for structs containing unions in case of C++ modules. 
(#GH143168) +- Fixed incorrect token location when emitting diagnostics for tokens expanded from macros. (#GH143216) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 0b2fab4a45c96..d99de77a52919 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -290,9 +290,7 @@ class Parser : public CodeCompletionHandler { return ConsumeToken(); } - SourceLocation getEndOfPreviousToken() { - return PP.getLocForEndOfToken(PrevTokLocation); - } + SourceLocation getEndOfPreviousToken() const; /// GetLookAheadToken - This peeks ahead N tokens and returns that token /// without consuming any tokens. LookAhead(0) returns 'Tok', LookAhead(1) diff --git a/clang/lib/Parse/ParseExprCXX.cpp b/clang/lib/Parse/ParseExprCXX.cpp index d95260829e4a0..55ad7f256fa82 100644 --- a/clang/lib/Parse/ParseExprCXX.cpp +++ b/clang/lib/Parse/ParseExprCXX.cpp @@ -421,8 +421,8 @@ bool Parser::ParseOptionalCXXScopeSpecifier( // like we never saw it. Token Identifier = Tok; // Stash away the identifier. ConsumeToken(); // Eat the identifier, current token is now '::'. - Diag(PP.getLocForEndOfToken(ConsumeToken()), diag::err_expected) - << tok::identifier; + ConsumeToken(); + Diag(getEndOfPreviousToken(), diag::err_expected) << tok::identifier; UnconsumeToken(Identifier); // Stick the identifier back. Next = NextToken(); // Point Next at the '{' token. 
} diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index c788723023c8b..c00759893b0c4 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -832,10 +832,13 @@ StmtResult Parser::ParseCaseStatement(ParsedStmtContext StmtCtx, << "'case'" << tok::colon << FixItHint::CreateReplacement(ColonLoc, ":"); } else { - SourceLocation ExpectedLoc = PP.getLocForEndOfToken(PrevTokLocation); + SourceLocation ExpectedLoc = getEndOfPreviousToken(); + Diag(ExpectedLoc, diag::err_expected_after) << "'case'" << tok::colon - << FixItHint::CreateInsertion(ExpectedLoc, ":"); + << FixItHint::CreateInsertion(ExpectedLoc, + tok::getTokenName(tok::colon)); + ColonLoc = ExpectedLoc; } diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index db65c05cc114a..788ed79e0c1fa 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -1873,6 +1873,11 @@ Parser::TryAnnotateName(CorrectionCandidateCallback *CCC, return AnnotatedNameKind::Unresolved; } +SourceLocation Parser::getEndOfPreviousToken() const { + SourceLocation TokenEndLoc = PP.getLocForEndOfToken(PrevTokLocation); + return TokenEndLoc.isValid() ? 
TokenEndLoc : Tok.getLocation(); +} + bool Parser::TryKeywordIdentFallback(bool DisableKeyword) { assert(Tok.isNot(tok::identifier)); Diag(Tok, diag::ext_keyword_as_ident) diff --git a/clang/test/Parser/macro-expansion-recovery.cpp b/clang/test/Parser/macro-expansion-recovery.cpp new file mode 100644 index 0000000000000..6826cc04e4df5 --- /dev/null +++ b/clang/test/Parser/macro-expansion-recovery.cpp @@ -0,0 +1,22 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +namespace GH143216 { +#define A x y +enum { A }; // expected-error {{missing ',' between enumerators}} + +#define B x y +void f() { + int a[2]; + auto [B] = a; // expected-error {{expected ','}} +} + +#define C class D; +D C; // expected-error {{expected unqualified-id}} \ + // expected-error {{expected '>'}} \ + // expected-note {{to match this '<'}} + +#define E F::{ +class F { E }}; // expected-error {{expected identifier}} \ + // expected-error {{expected member name or ';' after declaration specifiers}} +} diff --git a/clang/test/Parser/switch-recovery.cpp b/clang/test/Parser/switch-recovery.cpp index baf703cd03aed..7b3909e3b0d32 100644 --- a/clang/test/Parser/switch-recovery.cpp +++ b/clang/test/Parser/switch-recovery.cpp @@ -229,3 +229,16 @@ void fn1() { } } // expected-error{{expected statement}} } + +namespace GH143216 { +#define FOO 1 case 3: + +int f(int x) { + switch (x) { + case FOO // expected-error {{expected ':' after 'case'}} + return 0; + default: + return 1; + } +} +} From 625bfb7179ad1acab2aba1023095826628275a60 Mon Sep 17 00:00:00 2001 From: Jiachen Yuan Date: Wed, 11 Jun 2025 14:23:41 -0700 Subject: [PATCH 153/851] Workaround MSVC Linker Issue when Cross-Compiling for ARM64EC (#143659) This MR presents a temporary workaround for the issue described at https://github.com/llvm/llvm-project/issues/143575. 
While an [upstream MSVC bug](https://developercommunity.visualstudio.com/t/MSVC-Linker-Issue-When-Cross-Compiling-L/10920141) is reported, it makes sense to apply a workaround in LLVM code to quickly unblock anyone affected. --- llvm/include/llvm/IR/Mangler.h | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/llvm/include/llvm/IR/Mangler.h b/llvm/include/llvm/IR/Mangler.h index e3dfe1eac6189..232101a8926b7 100644 --- a/llvm/include/llvm/IR/Mangler.h +++ b/llvm/include/llvm/IR/Mangler.h @@ -26,7 +26,16 @@ class Triple; class Twine; class raw_ostream; -constexpr std::string_view HybridPatchableTargetSuffix = "$hp_target"; +// TODO: The weird assignment of HybridPatchableTargetSuffix below is a +// temporary workaround for a linker failure that is only hit when compiling +// llvm for arm64ec on windows. The description and context of the issue is at +// https://github.com/llvm/llvm-project/issues/143575. +// An upstream MSVC bug is filed at +// https://developercommunity.visualstudio.com/t/MSVC-Linker-Issue-When-Cross- +// Compiling-L/10920141. +constexpr char HybridPatchableTargetSuffixArr[] = "$hp_target"; +constexpr std::string_view HybridPatchableTargetSuffix = + HybridPatchableTargetSuffixArr; class Mangler { /// We need to give global values the same name every time they are mangled. From 7838fc0cd3fbe578d9554fdcd3198c2ba3616bcc Mon Sep 17 00:00:00 2001 From: Sirraide Date: Wed, 11 Jun 2025 23:24:33 +0200 Subject: [PATCH 154/851] [Clang] [NFC] Move diagnostics emitting code from `DiagnosticIDs` into `DiagnosticsEngine` (#143517) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit It makes more sense for this functionality to be all in one place rather than split up across two files—at least it caused me a bit of a headache to try and find all places where we were actually forwarding the diagnostic to the `DiagnosticConsumer`. 
Moreover, moving these functions into `DiagnosticsEngine` simplifies the code quite a bit since we access members of `DiagnosticsEngine` more frequently than those of `DiagnosticIDs`. There was also a duplicated code snippet that I’ve moved out into a new function. --- clang/include/clang/Basic/Diagnostic.h | 23 +++--- clang/include/clang/Basic/DiagnosticIDs.h | 12 --- clang/lib/Basic/Diagnostic.cpp | 98 ++++++++++++++++++++--- clang/lib/Basic/DiagnosticIDs.cpp | 97 ---------------------- 4 files changed, 102 insertions(+), 128 deletions(-) diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index e9c54c3c487c9..efee8302e7501 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -18,6 +18,7 @@ #include "clang/Basic/DiagnosticOptions.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" +#include "clang/Basic/UnsignedOrNone.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/FunctionExtras.h" @@ -49,6 +50,7 @@ class FileSystem; namespace clang { class DeclContext; +class Diagnostic; class DiagnosticBuilder; class DiagnosticConsumer; class IdentifierInfo; @@ -228,6 +230,8 @@ class DiagStorageAllocator { class DiagnosticsEngine : public RefCountedBase { public: /// The level of the diagnostic, after it has been through mapping. + // FIXME: Make this an alias for DiagnosticIDs::Level as soon as + // we can use 'using enum'. enum Level { Ignored = DiagnosticIDs::Ignored, Note = DiagnosticIDs::Note, @@ -532,7 +536,7 @@ class DiagnosticsEngine : public RefCountedBase { /// /// This is used to emit continuation diagnostics with the same level as the /// diagnostic that they follow. - DiagnosticIDs::Level LastDiagLevel; + Level LastDiagLevel; /// Number of warnings reported unsigned NumWarnings; @@ -777,18 +781,16 @@ class DiagnosticsEngine : public RefCountedBase { /// the middle of another diagnostic. 
/// /// This can be used by clients who suppress diagnostics themselves. - void setLastDiagnosticIgnored(bool Ignored) { - if (LastDiagLevel == DiagnosticIDs::Fatal) + void setLastDiagnosticIgnored(bool IsIgnored) { + if (LastDiagLevel == Fatal) FatalErrorOccurred = true; - LastDiagLevel = Ignored ? DiagnosticIDs::Ignored : DiagnosticIDs::Warning; + LastDiagLevel = IsIgnored ? Ignored : Warning; } /// Determine whether the previous diagnostic was ignored. This can /// be used by clients that want to determine whether notes attached to a /// diagnostic will be suppressed. - bool isLastDiagnosticIgnored() const { - return LastDiagLevel == DiagnosticIDs::Ignored; - } + bool isLastDiagnosticIgnored() const { return LastDiagLevel == Ignored; } /// Controls whether otherwise-unmapped extension diagnostics are /// mapped onto ignore/warning/error. @@ -1024,9 +1026,10 @@ class DiagnosticsEngine : public RefCountedBase { /// Used to report a diagnostic that is finally fully formed. /// /// \returns true if the diagnostic was emitted, false if it was suppressed. - bool ProcessDiag(const DiagnosticBuilder &DiagBuilder) { - return Diags->ProcessDiag(*this, DiagBuilder); - } + bool ProcessDiag(const DiagnosticBuilder &DiagBuilder); + + /// Forward a diagnostic to the DiagnosticConsumer. + void Report(Level DiagLevel, const Diagnostic &Info); /// @name Diagnostic Emission /// @{ diff --git a/clang/include/clang/Basic/DiagnosticIDs.h b/clang/include/clang/Basic/DiagnosticIDs.h index 80d52a0d01112..2b095f0fd6741 100644 --- a/clang/include/clang/Basic/DiagnosticIDs.h +++ b/clang/include/clang/Basic/DiagnosticIDs.h @@ -483,18 +483,6 @@ class DiagnosticIDs : public RefCountedBase { Class getDiagClass(unsigned DiagID) const; - /// Used to report a diagnostic that is finally fully formed. - /// - /// \returns \c true if the diagnostic was emitted, \c false if it was - /// suppressed. 
- bool ProcessDiag(DiagnosticsEngine &Diag, - const DiagnosticBuilder &DiagBuilder) const; - - /// Used to emit a diagnostic that is finally fully formed, - /// ignoring suppression. - void EmitDiag(DiagnosticsEngine &Diag, const DiagnosticBuilder &DiagBuilder, - Level DiagLevel) const; - /// Whether the diagnostic may leave the AST in a state where some /// invariants can break. bool isUnrecoverable(unsigned DiagID) const; diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index 694224071347a..95d86cb153b4b 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -130,7 +130,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) { TrapNumErrorsOccurred = 0; TrapNumUnrecoverableErrorsOccurred = 0; - LastDiagLevel = DiagnosticIDs::Ignored; + LastDiagLevel = Ignored; if (!soft) { // Clear state related to #pragma diagnostic. @@ -658,13 +658,95 @@ void DiagnosticsEngine::Report(const StoredDiagnostic &storedDiag) { Level DiagLevel = storedDiag.getLevel(); Diagnostic Info(this, storedDiag.getLocation(), storedDiag.getID(), DiagStorage, storedDiag.getMessage()); + Report(DiagLevel, Info); +} + +void DiagnosticsEngine::Report(Level DiagLevel, const Diagnostic &Info) { + assert(DiagLevel != Ignored && "Cannot emit ignored diagnostics!"); Client->HandleDiagnostic(DiagLevel, Info); if (Client->IncludeInDiagnosticCounts()) { - if (DiagLevel == DiagnosticsEngine::Warning) + if (DiagLevel == Warning) ++NumWarnings; } } +/// ProcessDiag - This is the method used to report a diagnostic that is +/// finally fully formed. +bool DiagnosticsEngine::ProcessDiag(const DiagnosticBuilder &DiagBuilder) { + Diagnostic Info(this, DiagBuilder); + + assert(getClient() && "DiagnosticClient not set!"); + + // Figure out the diagnostic level of this message. 
+ unsigned DiagID = Info.getID(); + Level DiagLevel = getDiagnosticLevel(DiagID, Info.getLocation()); + + // Update counts for DiagnosticErrorTrap even if a fatal error occurred + // or diagnostics are suppressed. + if (DiagLevel >= Error) { + ++TrapNumErrorsOccurred; + if (Diags->isUnrecoverable(DiagID)) + ++TrapNumUnrecoverableErrorsOccurred; + } + + if (SuppressAllDiagnostics) + return false; + + if (DiagLevel != Note) { + // Record that a fatal error occurred only when we see a second + // non-note diagnostic. This allows notes to be attached to the + // fatal error, but suppresses any diagnostics that follow those + // notes. + if (LastDiagLevel == Fatal) + FatalErrorOccurred = true; + + LastDiagLevel = DiagLevel; + } + + // If a fatal error has already been emitted, silence all subsequent + // diagnostics. + if (FatalErrorOccurred) { + if (DiagLevel >= Error && Client->IncludeInDiagnosticCounts()) + ++NumErrors; + + return false; + } + + // If the client doesn't care about this message, don't issue it. If this is + // a note and the last real diagnostic was ignored, ignore it too. + if (DiagLevel == Ignored || (DiagLevel == Note && LastDiagLevel == Ignored)) + return false; + + if (DiagLevel >= Error) { + if (Diags->isUnrecoverable(DiagID)) + UnrecoverableErrorOccurred = true; + + // Warnings which have been upgraded to errors do not prevent compilation. + if (Diags->isDefaultMappingAsError(DiagID)) + UncompilableErrorOccurred = true; + + ErrorOccurred = true; + if (Client->IncludeInDiagnosticCounts()) + ++NumErrors; + + // If we've emitted a lot of errors, emit a fatal error instead of it to + // stop a flood of bogus errors. + if (ErrorLimit && NumErrors > ErrorLimit && DiagLevel == Error) { + Report(diag::fatal_too_many_errors); + return false; + } + } + + // Make sure we set FatalErrorOccurred to ensure that the notes from the + // diagnostic that caused `fatal_too_many_errors` won't be emitted. 
+ if (Info.getID() == diag::fatal_too_many_errors) + FatalErrorOccurred = true; + + // Finally, report it. + Report(DiagLevel, Info); + return true; +} + bool DiagnosticsEngine::EmitDiagnostic(const DiagnosticBuilder &DB, bool Force) { assert(getClient() && "DiagnosticClient not set!"); @@ -674,14 +756,12 @@ bool DiagnosticsEngine::EmitDiagnostic(const DiagnosticBuilder &DB, Diagnostic Info(this, DB); // Figure out the diagnostic level of this message. - DiagnosticIDs::Level DiagLevel = - Diags->getDiagnosticLevel(Info.getID(), Info.getLocation(), *this); + Level DiagLevel = getDiagnosticLevel(Info.getID(), Info.getLocation()); - Emitted = (DiagLevel != DiagnosticIDs::Ignored); - if (Emitted) { - // Emit the diagnostic regardless of suppression level. - Diags->EmitDiag(*this, DB, DiagLevel); - } + // Emit the diagnostic regardless of suppression level. + Emitted = DiagLevel != Ignored; + if (Emitted) + Report(DiagLevel, Info); } else { // Process the diagnostic, sending the accumulated information to the // DiagnosticConsumer. diff --git a/clang/lib/Basic/DiagnosticIDs.cpp b/clang/lib/Basic/DiagnosticIDs.cpp index 3e90b2d804773..dcf0c6cb54282 100644 --- a/clang/lib/Basic/DiagnosticIDs.cpp +++ b/clang/lib/Basic/DiagnosticIDs.cpp @@ -823,103 +823,6 @@ unsigned DiagnosticIDs::getCXXCompatDiagId(const LangOptions &LangOpts, return StdVer >= D.StdVer ? D.DiagId : D.PreDiagId; } -/// ProcessDiag - This is the method used to report a diagnostic that is -/// finally fully formed. -bool DiagnosticIDs::ProcessDiag(DiagnosticsEngine &Diag, - const DiagnosticBuilder &DiagBuilder) const { - Diagnostic Info(&Diag, DiagBuilder); - - assert(Diag.getClient() && "DiagnosticClient not set!"); - - // Figure out the diagnostic level of this message. - unsigned DiagID = Info.getID(); - DiagnosticIDs::Level DiagLevel - = getDiagnosticLevel(DiagID, Info.getLocation(), Diag); - - // Update counts for DiagnosticErrorTrap even if a fatal error occurred - // or diagnostics are suppressed. 
- if (DiagLevel >= DiagnosticIDs::Error) { - ++Diag.TrapNumErrorsOccurred; - if (isUnrecoverable(DiagID)) - ++Diag.TrapNumUnrecoverableErrorsOccurred; - } - - if (Diag.SuppressAllDiagnostics) - return false; - - if (DiagLevel != DiagnosticIDs::Note) { - // Record that a fatal error occurred only when we see a second - // non-note diagnostic. This allows notes to be attached to the - // fatal error, but suppresses any diagnostics that follow those - // notes. - if (Diag.LastDiagLevel == DiagnosticIDs::Fatal) - Diag.FatalErrorOccurred = true; - - Diag.LastDiagLevel = DiagLevel; - } - - // If a fatal error has already been emitted, silence all subsequent - // diagnostics. - if (Diag.FatalErrorOccurred) { - if (DiagLevel >= DiagnosticIDs::Error && - Diag.Client->IncludeInDiagnosticCounts()) { - ++Diag.NumErrors; - } - - return false; - } - - // If the client doesn't care about this message, don't issue it. If this is - // a note and the last real diagnostic was ignored, ignore it too. - if (DiagLevel == DiagnosticIDs::Ignored || - (DiagLevel == DiagnosticIDs::Note && - Diag.LastDiagLevel == DiagnosticIDs::Ignored)) - return false; - - if (DiagLevel >= DiagnosticIDs::Error) { - if (isUnrecoverable(DiagID)) - Diag.UnrecoverableErrorOccurred = true; - - // Warnings which have been upgraded to errors do not prevent compilation. - if (isDefaultMappingAsError(DiagID)) - Diag.UncompilableErrorOccurred = true; - - Diag.ErrorOccurred = true; - if (Diag.Client->IncludeInDiagnosticCounts()) { - ++Diag.NumErrors; - } - - // If we've emitted a lot of errors, emit a fatal error instead of it to - // stop a flood of bogus errors. - if (Diag.ErrorLimit && Diag.NumErrors > Diag.ErrorLimit && - DiagLevel == DiagnosticIDs::Error) { - Diag.Report(diag::fatal_too_many_errors); - return false; - } - } - - // Make sure we set FatalErrorOccurred to ensure that the notes from the - // diagnostic that caused `fatal_too_many_errors` won't be emitted. 
- if (Info.getID() == diag::fatal_too_many_errors) - Diag.FatalErrorOccurred = true; - // Finally, report it. - EmitDiag(Diag, DiagBuilder, DiagLevel); - return true; -} - -void DiagnosticIDs::EmitDiag(DiagnosticsEngine &Diag, - const DiagnosticBuilder &DiagBuilder, - Level DiagLevel) const { - Diagnostic Info(&Diag, DiagBuilder); - assert(DiagLevel != DiagnosticIDs::Ignored && "Cannot emit ignored diagnostics!"); - - Diag.Client->HandleDiagnostic((DiagnosticsEngine::Level)DiagLevel, Info); - if (Diag.Client->IncludeInDiagnosticCounts()) { - if (DiagLevel == DiagnosticIDs::Warning) - ++Diag.NumWarnings; - } -} - bool DiagnosticIDs::isUnrecoverable(unsigned DiagID) const { // Only errors may be unrecoverable. if (getDiagClass(DiagID) < CLASS_ERROR) From 6f2ba4712f17d7c82228a5b705570571e13a3832 Mon Sep 17 00:00:00 2001 From: Ian Wood Date: Wed, 11 Jun 2025 14:34:02 -0700 Subject: [PATCH 155/851] [mlir] Fix ComposeExpandOfCollapseOp for dynamic case (#142663) Changes `findCollapsingReassociation` to return nullopt in all cases where source shape has `>=2` dynamic dims. `expand(collapse)` can reshape to any valid output shape but a collapse can only collapse contiguous dimensions. When there are `>=2` dynamic dimensions it is impossible to determine if it can be simplified to a collapse or if it is performing a more advanced reassociation.
This problem was uncovered by https://github.com/llvm/llvm-project/pull/137963 --------- Signed-off-by: Ian Wood --- mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h | 9 ++++++--- mlir/test/Dialect/Tensor/canonicalize.mlir | 14 ++++++++++++++ 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h index af575e10acc8e..61c2a50e514ca 100644 --- a/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h +++ b/mlir/include/mlir/Dialect/Utils/ReshapeOpsUtils.h @@ -387,11 +387,14 @@ struct ComposeExpandOfCollapseOp : public OpRewritePattern { auto resultSubShape = resultShape.slice(resultIndices.front(), resultIndices.size()); + if (llvm::count_if(srcSubShape, ShapedType::isDynamic) >= 2 && + llvm::count_if(resultSubShape, ShapedType::isDynamic) >= 2) + return std::nullopt; + if (srcSubShape.size() == resultSubShape.size()) { - if (srcSubShape != resultSubShape || - llvm::count_if(srcSubShape, ShapedType::isDynamic) >= 2) { + if (srcSubShape != resultSubShape) return std::nullopt; - } + for (auto index : llvm::seq(0, srcSubShape.size())) { composedReassociation.emplace_back(1, srcIndices.front() + index); } diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 65c5b3e8602eb..67b03b0a3485b 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -1272,6 +1272,20 @@ func.func @compose_expand_of_collapse_dynamic(%arg0 : tensor<4x?x10x64x2xf16>, % // ----- +func.func @no_compose_collapse_of_expand_dynamic(%arg0 : tensor, %arg1: index) -> tensor { + %collapse = tensor.collapse_shape %arg0 [[0, 1, 2, 3]] : tensor into tensor + %expanded_19 = tensor.expand_shape %collapse [[0, 1, 2]] output_shape [%arg1, 8, %arg1] : tensor into tensor + return %expanded_19 : tensor +} +// CHECK-LABEL: func @no_compose_collapse_of_expand_dynamic +// CHECK-SAME: %[[ARG0:.+]]: tensor +// 
CHECK-SAME: %[[ARG1:.+]]: index +// CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %[[ARG0]] +// CHECK: %[[EXPAND:.+]] = tensor.expand_shape %[[COLLAPSE]] +// CHECK: return %[[EXPAND]] + +// ----- + // CHECK-LABEL: func @zero_rank_reshape_multi func.func @zero_rank_reshape_multi(%arg0: tensor) -> tensor { // CHECK: return %arg0 From 9c9a4a284e95ea5e27617af7235e3ab049bae680 Mon Sep 17 00:00:00 2001 From: Ellis Hoag Date: Wed, 11 Jun 2025 14:54:30 -0700 Subject: [PATCH 156/851] [LOH] Don't emit AdrpAddStr when register could be clobbered (#142849) https://github.com/llvm/llvm-project/commit/b783aa89795635cbe7b25b4143b562931fcec9f6 added a check to ensure an `AdrpAddLdr` LOH isn't created when there is an instruction between the `add` and `ldr` https://github.com/llvm/llvm-project/blob/50c5704dc000cc0af41a511aa44db03233edf0af/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp#L419-L431 We need a similar check for `AdrpAddStr`. Although this technically isn't implemented in LLD, it could be in the future. 
https://github.com/llvm/llvm-project/blob/50c5704dc000cc0af41a511aa44db03233edf0af/lld/MachO/Arch/ARM64.cpp#L699-L702 --- llvm/lib/Target/AArch64/AArch64CollectLOH.cpp | 37 +++++++++++-------- .../AArch64/loh-adrp-add-ldr-clobber.mir | 37 +++++++++++++------ 2 files changed, 48 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp index 53e8e438c5e57..064716216d1cb 100644 --- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp +++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp @@ -247,6 +247,17 @@ static bool supportLoadFromLiteral(const MachineInstr &MI) { } } +/// Returns \p true if there are no non-debug instructions between \p First and +/// \p Second +static bool areInstructionsConsecutive(const MachineInstr *First, + const MachineInstr *Second) { + auto It = First->getIterator(); + auto EndIt = First->getParent()->instr_end(); + if (It == EndIt) + return false; + return next_nodbg(It, EndIt) == Second->getIterator(); +} + /// Number of GPR registers tracked by mapRegToGPRIndex() static const unsigned N_GPR_REGS = 31; /// Map register number to index from 0-30. @@ -415,7 +426,7 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, ++NumADRPToLDR; } break; - case MCLOH_AdrpAddLdr: { + case MCLOH_AdrpAddLdr: // There is a possibility that the linker may try to rewrite: // adrp x0, @sym@PAGE // add x1, x0, @sym@PAGEOFF @@ -432,28 +443,24 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI, // FIXME: Implement proper liveness tracking for all registers. For now, // don't emit the LOH if there are any instructions between the add and // the ldr. 
- MachineInstr *AddMI = const_cast(Info.MI1); - const MachineInstr *LdrMI = Info.MI0; - auto AddIt = MachineBasicBlock::iterator(AddMI); - auto EndIt = AddMI->getParent()->end(); - if (AddMI->getIterator() == EndIt || LdrMI != &*next_nodbg(AddIt, EndIt)) + if (!areInstructionsConsecutive(Info.MI1, Info.MI0)) break; - LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddLdr:\n" << '\t' << MI << '\t' << *Info.MI1 << '\t' << *Info.MI0); AFI.addLOHDirective(MCLOH_AdrpAddLdr, {&MI, Info.MI1, Info.MI0}); ++NumADDToLDR; break; - } case MCLOH_AdrpAddStr: - if (Info.MI1 != nullptr) { - LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n" - << '\t' << MI << '\t' << *Info.MI1 << '\t' - << *Info.MI0); - AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0}); - ++NumADDToSTR; - } + if (!Info.MI1) + break; + if (!areInstructionsConsecutive(Info.MI1, Info.MI0)) + break; + LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAddStr:\n" + << '\t' << MI << '\t' << *Info.MI1 << '\t' + << *Info.MI0); + AFI.addLOHDirective(MCLOH_AdrpAddStr, {&MI, Info.MI1, Info.MI0}); + ++NumADDToSTR; break; case MCLOH_AdrpLdrGotLdr: LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdrGotLdr:\n" diff --git a/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir b/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir index ce2d8f02f4cc8..a1d8bf375a19b 100644 --- a/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir +++ b/llvm/test/CodeGen/AArch64/loh-adrp-add-ldr-clobber.mir @@ -1,16 +1,34 @@ -# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s +# RUN: llc -o /dev/null %s -mtriple=aarch64-apple-ios -run-pass=aarch64-collect-loh -debug-only=aarch64-collect-loh 2>&1 | FileCheck %s --implicit-check-not=MCLOH_ # REQUIRES: asserts + +# Check that we don't emit LOHs when there is a clobbering def of x8. 
--- | @sym2 = local_unnamed_addr global [10000000 x i32] zeroinitializer, align 8 @sym = local_unnamed_addr global i32 zeroinitializer, align 8 - define i32 @main() { - ret i32 0 - } + define i32 @adrp_add_ldr() { ret i32 0 } + define i32 @adrp_add_str() { ret i32 0 } +... + +--- +name: adrp_add_ldr +alignment: 4 +tracksRegLiveness: true +liveins: + - { reg: '$x21', virtual-reg: '' } +body: | + bb.0: + liveins: $x21 + renamable $x8 = ADRP target-flags(aarch64-page) @sym + renamable $x9 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @sym, 0 + renamable $x8 = ADDXri killed renamable $x21, 1, 0 + $x9 = LDRXui $x9, 0 + RET undef $lr ... + --- -name: main +name: adrp_add_str alignment: 4 tracksRegLiveness: true liveins: @@ -19,13 +37,10 @@ liveins: body: | bb.0: liveins: $x21, $x22 - ; Check we don't emit an loh here because there's a clobbering def of x8 before the ldr. - ; CHECK-LABEL: main - ; CHECK-NOT: MCLOH_AdrpAddLdr renamable $x8 = ADRP target-flags(aarch64-page) @sym renamable $x9 = ADDXri killed renamable $x8, target-flags(aarch64-pageoff, aarch64-nc) @sym, 0 - renamable $x8 = ADDXri killed renamable $x22, 1, 0 - $x9 = LDRXui $x9, 0 - RET undef $lr + renamable $x8 = ADDXri killed renamable $x21, 1, 0 + STRXui $x22, $x9, 0 + RET undef $lr ... From 74172add65aa14e77e98b048db0074c3f273057f Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Wed, 11 Jun 2025 18:18:22 -0400 Subject: [PATCH 157/851] [mlir][generate-test-checks] Do not emit the autogenerated note if it exists (#143750) Prior to this PR, the script removed the already existing autogenerated note if we came across a line that was equal to the note. But the default note is multiple lines, so there would never be a match. Instead, check to see if the current line is a substring of the autogenerated note. 
Co-authored-by: Michael Maitland --- mlir/utils/generate-test-checks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py index 11fb4e40072e7..f77c9688d9318 100755 --- a/mlir/utils/generate-test-checks.py +++ b/mlir/utils/generate-test-checks.py @@ -208,7 +208,7 @@ def process_source_lines(source_lines, note, args): source_segments = [[]] for line in source_lines: # Remove previous note. - if line == note: + if line in note: continue # Remove previous CHECK lines. if line.find(args.check_prefix) != -1: From 0e457315f55889878ccbc3e35d4beb04e277733f Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Wed, 11 Jun 2025 18:19:15 -0400 Subject: [PATCH 158/851] [mlir][generate-test-checks] Emit attributes with rest of CHECK lines (#143759) Prior to this patch, generating test checks in place put the ATTR definitions at the very top of the file, above the RUN lines and autogenerated note. All CHECK lines should be below the RUN lines and autogenerated note. This change ensures that the attribute definitions are emitted with the rest of the CHECK lines.
--------- Co-authored-by: Michael Maitland --- mlir/utils/generate-test-checks.py | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/mlir/utils/generate-test-checks.py b/mlir/utils/generate-test-checks.py index f77c9688d9318..14a790e6d0e6e 100755 --- a/mlir/utils/generate-test-checks.py +++ b/mlir/utils/generate-test-checks.py @@ -220,12 +220,19 @@ def process_source_lines(source_lines, note, args): source_segments[-1].append(line + "\n") return source_segments -def process_attribute_definition(line, attribute_namer, output): + +def process_attribute_definition(line, attribute_namer): m = ATTR_DEF_RE.match(line) if m: attribute_name = attribute_namer.generate_name(m.group(1)) - line = '// CHECK: #[[' + attribute_name + ':.+]] =' + line[len(m.group(0)):] + '\n' - output.write(line) + return ( + "// CHECK: #[[" + + attribute_name + + ":.+]] =" + + line[len(m.group(0)) :] + + "\n" + ) + return None def process_attribute_references(line, attribute_namer): @@ -340,6 +347,9 @@ def main(): variable_namer = VariableNamer(args.variable_names) attribute_namer = AttributeNamer(args.attribute_names) + # Store attribute definitions to emit at appropriate scope + pending_attr_defs = [] + # Process lines for input_line in input_lines: if not input_line: @@ -350,8 +360,9 @@ def main(): if input_line.startswith("// -----"): continue - # Check if this is an attribute definition and process it - process_attribute_definition(input_line, attribute_namer, output) + if ATTR_DEF_RE.match(input_line): + pending_attr_defs.append(input_line) + continue # Lines with blocks begin with a ^. These lines have a trailing comment # that needs to be stripped. 
@@ -407,6 +418,13 @@ def main(): output_line += process_line(ssa_split[1:], variable_namer) else: + # Emit any pending attribute definitions at the start of this scope + for attr in pending_attr_defs: + attr_line = process_attribute_definition(attr, attribute_namer) + if attr_line: + output_segments[-1].append(attr_line) + pending_attr_defs.clear() + # Output the first line chunk that does not contain an SSA name for the # label. output_line = "// " + args.check_prefix + "-LABEL: " + ssa_split[0] + "\n" From ee35e342945d6825c9b2b004fd135cf16c84ea0e Mon Sep 17 00:00:00 2001 From: Nikolay Panchenko Date: Wed, 11 Jun 2025 19:00:29 -0400 Subject: [PATCH 159/851] [ConstantFolding] Add folding for [de]interleave2, insert and extract (#141301) The change adds folding for 4 vector intrinsics: `interleave2`, `deinterleave2`, `vector_extract` and `vector_insert`. For the last 2 intrinsics the change does not use `ShuffleVector` fold mechanism as it's much simpler to construct result vector explicitly. 
--- llvm/lib/Analysis/ConstantFolding.cpp | 97 +++++++++++++++++++ .../InstSimplify/ConstProp/vector-calls.ll | 68 +++++++++++++ 2 files changed, 165 insertions(+) create mode 100644 llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 1ef0badd23757..139a0b81e299b 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -1635,6 +1635,10 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) { case Intrinsic::vector_reduce_smax: case Intrinsic::vector_reduce_umin: case Intrinsic::vector_reduce_umax: + case Intrinsic::vector_extract: + case Intrinsic::vector_insert: + case Intrinsic::vector_interleave2: + case Intrinsic::vector_deinterleave2: // Target intrinsics case Intrinsic::amdgcn_perm: case Intrinsic::amdgcn_wave_reduce_umin: @@ -3758,6 +3762,72 @@ static Constant *ConstantFoldFixedVectorCall( } return nullptr; } + case Intrinsic::vector_extract: { + auto *Idx = dyn_cast(Operands[1]); + Constant *Vec = Operands[0]; + if (!Idx || !isa(Vec->getType())) + return nullptr; + + unsigned NumElements = FVTy->getNumElements(); + unsigned VecNumElements = + cast(Vec->getType())->getNumElements(); + unsigned StartingIndex = Idx->getZExtValue(); + + // Extracting entire vector is nop + if (NumElements == VecNumElements && StartingIndex == 0) + return Vec; + + for (unsigned I = StartingIndex, E = StartingIndex + NumElements; I < E; + ++I) { + Constant *Elt = Vec->getAggregateElement(I); + if (!Elt) + return nullptr; + Result[I - StartingIndex] = Elt; + } + + return ConstantVector::get(Result); + } + case Intrinsic::vector_insert: { + Constant *Vec = Operands[0]; + Constant *SubVec = Operands[1]; + auto *Idx = dyn_cast(Operands[2]); + if (!Idx || !isa(Vec->getType())) + return nullptr; + + unsigned SubVecNumElements = + cast(SubVec->getType())->getNumElements(); + unsigned VecNumElements = + 
cast(Vec->getType())->getNumElements(); + unsigned IdxN = Idx->getZExtValue(); + // Replacing entire vector with a subvec is nop + if (SubVecNumElements == VecNumElements && IdxN == 0) + return SubVec; + + for (unsigned I = 0; I < VecNumElements; ++I) { + Constant *Elt; + if (I < IdxN + SubVecNumElements) + Elt = SubVec->getAggregateElement(I - IdxN); + else + Elt = Vec->getAggregateElement(I); + if (!Elt) + return nullptr; + Result[I] = Elt; + } + return ConstantVector::get(Result); + } + case Intrinsic::vector_interleave2: { + unsigned NumElements = + cast(Operands[0]->getType())->getNumElements(); + for (unsigned I = 0; I < NumElements; ++I) { + Constant *Elt0 = Operands[0]->getAggregateElement(I); + Constant *Elt1 = Operands[1]->getAggregateElement(I); + if (!Elt0 || !Elt1) + return nullptr; + Result[2 * I] = Elt0; + Result[2 * I + 1] = Elt1; + } + return ConstantVector::get(Result); + } default: break; } @@ -3919,6 +3989,33 @@ ConstantFoldStructCall(StringRef Name, Intrinsic::ID IntrinsicID, return nullptr; return ConstantStruct::get(StTy, SinResult, CosResult); } + case Intrinsic::vector_deinterleave2: { + auto *Vec = dyn_cast(Operands[0]); + if (!Vec) + return nullptr; + + auto *VecTy = cast(Vec->getType()); + unsigned NumElements = VecTy->getElementCount().getKnownMinValue() / 2; + if (isa(Vec)) { + auto *HalfVecTy = VectorType::getHalfElementsVectorType(VecTy); + return ConstantStruct::get(StTy, ConstantAggregateZero::get(HalfVecTy), + ConstantAggregateZero::get(HalfVecTy)); + } + if (isa(Vec->getType())) { + SmallVector Res0(NumElements), Res1(NumElements); + for (unsigned I = 0; I < NumElements; ++I) { + Constant *Elt0 = Vec->getAggregateElement(2 * I); + Constant *Elt1 = Vec->getAggregateElement(2 * I + 1); + if (!Elt0 || !Elt1) + return nullptr; + Res0[I] = Elt0; + Res1[I] = Elt1; + } + return ConstantStruct::get(StTy, ConstantVector::get(Res0), + ConstantVector::get(Res1)); + } + return nullptr; + } default: // TODO: Constant folding of vector 
intrinsics that fall through here does // not work (e.g. overflow intrinsics) diff --git a/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll new file mode 100644 index 0000000000000..9dbe3d4e50ee1 --- /dev/null +++ b/llvm/test/Transforms/InstSimplify/ConstProp/vector-calls.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -passes=instsimplify,verify -S | FileCheck %s + +define <3 x i32> @fold_vector_extract() { +; CHECK-LABEL: define <3 x i32> @fold_vector_extract() { +; CHECK-NEXT: ret <3 x i32> +; + %1 = call <3 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> , i64 3) + ret <3 x i32> %1 +} + +@a = external global i16, align 1 + +define <3 x i32> @fold_vector_extract_constexpr() { +; CHECK-LABEL: define <3 x i32> @fold_vector_extract_constexpr() { +; CHECK-NEXT: ret <3 x i32> +; + %1 = call <3 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> , i64 0) + ret <3 x i32> %1 +} + +define <8 x i32> @fold_vector_extract_nop() { +; CHECK-LABEL: define <8 x i32> @fold_vector_extract_nop() { +; CHECK-NEXT: ret <8 x i32> +; + %1 = call <8 x i32> @llvm.vector.extract.v3i32.v8i32(<8 x i32> , i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @fold_vector_insert() { +; CHECK-LABEL: define <8 x i32> @fold_vector_insert() { +; CHECK-NEXT: ret <8 x i32> +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32(<8 x i32> , <4 x i32> , i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @fold_vector_insert_nop() { +; CHECK-LABEL: define <8 x i32> @fold_vector_insert_nop() { +; CHECK-NEXT: ret <8 x i32> +; + %1 = call <8 x i32> @llvm.vector.insert.v8i32(<8 x i32> , <8 x i32> , i64 0) + ret <8 x i32> %1 +} + +define <8 x i32> @fold_vector_interleave2() { +; CHECK-LABEL: define <8 x i32> @fold_vector_interleave2() { +; CHECK-NEXT: ret <8 x i32> +; + %1 = call <8 x i32> @llvm.vector.interleave2.v8i32(<4 x i32> , <4 x i32> ) + ret <8 x i32> %1 
+} + +define {<4 x i32>, <4 x i32>} @fold_vector_deinterleave2() { +; CHECK-LABEL: define { <4 x i32>, <4 x i32> } @fold_vector_deinterleave2() { +; CHECK-NEXT: ret { <4 x i32>, <4 x i32> } { <4 x i32> , <4 x i32> } +; + %1 = call {<4 x i32>, <4 x i32>} @llvm.vector.deinterleave2.v4i32.v8i32(<8 x i32> ) + ret {<4 x i32>, <4 x i32>} %1 +} + +define {, } @fold_scalable_vector_deinterleave2() { +; CHECK-LABEL: define { , } @fold_scalable_vector_deinterleave2() { +; CHECK-NEXT: ret { , } zeroinitializer +; + %1 = call {, } @llvm.vector.deinterleave2.v4i32.v8i32( zeroinitializer) + ret {, } %1 +} From dc4335a2bf75c7b9928a72a7f15df0276120d7ed Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Wed, 11 Jun 2025 18:22:05 -0500 Subject: [PATCH 160/851] [libc] Perform bitfield zero initialization wave-parallel (#143607) Summary: We need to set the bitfield memory to zero because the system does not guarantee zeroed out memory. Even if fresh pages are zero, the system allows re-use so we would need a `kfd` level API to skip this step. Because we can't, this patch updates the logic to perform the zero initialization wave-parallel. This reduces the amount of time it takes to allocate a fresh slab by up to a tenth. This has the unfortunate side effect that the control flow is more convoluted and we waste some extra registers, but it's worth it to reduce the slab allocation latency. --- libc/src/__support/GPU/allocator.cpp | 46 +++++++++++++++++++++------- 1 file changed, 35 insertions(+), 11 deletions(-) diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp index ecc0de1cb6ec3..66ab155e5c299 100644 --- a/libc/src/__support/GPU/allocator.cpp +++ b/libc/src/__support/GPU/allocator.cpp @@ -129,6 +129,14 @@ static inline constexpr T round_up(const T x) { return (x + N) & ~(N - 1); } +// Perform a lane parallel memset on a uint32_t pointer.
+void uniform_memset(uint32_t *s, uint32_t c, uint32_t n, uint64_t uniform) { + uint64_t mask = gpu::get_lane_mask(); + uint32_t workers = cpp::popcount(uniform); + for (uint32_t i = impl::lane_count(mask & uniform); i < n; i += workers) + s[i] = c; +} + } // namespace impl /// A slab allocator used to hand out identically sized slabs of memory. @@ -157,10 +165,15 @@ struct Slab { Header *header = reinterpret_cast
(memory); header->chunk_size = chunk_size; header->global_index = global_index; + } - // This memset is expensive and likely not necessary for the current 'kfd' - // driver. Until zeroed pages are exposed by the API we must be careful. - __builtin_memset(get_bitfield(), 0, bitfield_bytes(chunk_size)); + // Set the necessary bitfield bytes to zero in parallel using many lanes. This + // must be called before the bitfield can be accessed safely, memory is not + // guaranteed to be zero initialized in the current implementation. + void initialize(uint64_t uniform) { + uint32_t size = (bitfield_bytes(get_chunk_size()) + sizeof(uint32_t) - 1) / + sizeof(uint32_t); + impl::uniform_memset(get_bitfield(), 0, size, uniform); } // Get the number of chunks that can theoretically fit inside this slab. @@ -354,14 +367,7 @@ struct GuardPtr { void *raw = impl::rpc_allocate(sizeof(Slab)); if (!raw) return nullptr; - Slab *mem = new (raw) Slab(cpp::forward(args)...); - - cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); - ptr.store(mem, cpp::MemoryOrder::RELAXED); - cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); - if (!ref.acquire(n, count)) - ref.reset(n, count); - return mem; + return new (raw) Slab(cpp::forward(args)...); } if (!expected || expected == reinterpret_cast(SENTINEL)) @@ -374,6 +380,16 @@ struct GuardPtr { return ptr.load(cpp::MemoryOrder::RELAXED); } + // Finalize the associated memory and signal that it is ready to use by + // resetting the counter. + void finalize(Slab *mem, uint32_t n, uint64_t &count) { + cpp::atomic_thread_fence(cpp::MemoryOrder::RELEASE); + ptr.store(mem, cpp::MemoryOrder::RELAXED); + cpp::atomic_thread_fence(cpp::MemoryOrder::ACQUIRE); + if (!ref.acquire(n, count)) + ref.reset(n, count); + } + public: // Attempt to lock access to the pointer, potentially creating it if empty. // The uniform mask represents which lanes share the same pointer. 
For each @@ -392,6 +408,14 @@ struct GuardPtr { if (!result) return nullptr; + // We defer storing the newly allocated slab until now so that we can use + // multiple lanes to initialize it and release it for use. + if (count == cpp::numeric_limits::max()) { + result->initialize(uniform); + if (gpu::get_lane_id() == uint32_t(cpp::countr_zero(uniform))) + finalize(result, cpp::popcount(uniform), count); + } + if (count != cpp::numeric_limits::max()) count = count - cpp::popcount(uniform) + impl::lane_count(uniform) + 1; From 1ecd108cb7ceda2b11281b5d173e2827feb60c55 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Wed, 11 Jun 2025 16:22:17 -0700 Subject: [PATCH 161/851] [libc] Migrate stdio tests to ErrnoCheckingTest. (#143802) Reduce the direct use of libc_errno in stdio unit tests by adopting ErrnoCheckingTest where appropriate. Also removes the libc_errno.h inclusions from stdlib.h tests that were accidentally added in d87eea35fac5a34a841c637db8908128409a184e --- libc/test/src/stdio/CMakeLists.txt | 10 ++++++++++ libc/test/src/stdio/fdopen_test.cpp | 14 ++++++-------- libc/test/src/stdio/fgetc_test.cpp | 5 ++--- libc/test/src/stdio/fgetc_unlocked_test.cpp | 5 ++--- libc/test/src/stdio/fgets_test.cpp | 6 +++--- libc/test/src/stdio/fileop_test.cpp | 20 +++++--------------- libc/test/src/stdio/fopencookie_test.cpp | 15 +++++++-------- libc/test/src/stdio/remove_test.cpp | 10 +++++----- libc/test/src/stdio/rename_test.cpp | 9 +++++---- libc/test/src/stdio/setvbuf_test.cpp | 8 ++++---- libc/test/src/stdio/unlocked_fileop_test.cpp | 7 +++---- libc/test/src/stdlib/StrtolTest.h | 1 - libc/test/src/stdlib/strtold_test.cpp | 1 - 13 files changed, 52 insertions(+), 59 deletions(-) diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index 01904a30504ed..3627006ec28fd 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -20,6 +20,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek 
libc.src.stdio.fwrite + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -68,6 +69,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fwrite libc.src.stdio.setvbuf + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -88,6 +90,7 @@ add_libc_test( libc.src.stdio.fread_unlocked libc.src.stdio.funlockfile libc.src.stdio.fwrite_unlocked + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -109,6 +112,7 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite + libc.test.UnitTest.ErrnoCheckingTest LINK_LIBRARIES LibcMemoryHelpers ) @@ -426,6 +430,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.sys.stat.mkdirat libc.src.unistd.access libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -440,6 +445,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.rename libc.src.unistd.access libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -456,6 +462,7 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.fgets libc.src.stdio.fputs libc.src.unistd.close + libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) endif() @@ -476,6 +483,7 @@ add_libc_test( libc.src.stdio.fopen libc.src.stdio.fwrite libc.src.stdio.getc + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -498,6 +506,7 @@ add_libc_test( libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.getc_unlocked + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -515,6 +524,7 @@ add_libc_test( libc.src.stdio.fgets libc.src.stdio.fopen libc.src.stdio.fwrite + libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index 104fc478b100e..b53184c30be36 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -9,20 +9,21 @@ #include "src/stdio/fdopen.h" #include "hdr/fcntl_macros.h" -#include "src/__support/libc_errno.h" 
#include "src/fcntl/open.h" #include "src/stdio/fclose.h" #include "src/stdio/fgets.h" #include "src/stdio/fputs.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include // For S_IRWXU -TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { +using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; - libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); @@ -52,8 +53,7 @@ TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { ASSERT_ERRNO_SUCCESS(); } -TEST(LlvmLibcStdioFdopenTest, InvalidFd) { - libc_errno = 0; +TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC); @@ -64,8 +64,7 @@ TEST(LlvmLibcStdioFdopenTest, InvalidFd) { ASSERT_TRUE(nullptr == fp); } -TEST(LlvmLibcStdioFdopenTest, InvalidMode) { - libc_errno = 0; +TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU); @@ -83,7 +82,6 @@ TEST(LlvmLibcStdioFdopenTest, InvalidMode) { auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w"); ASSERT_ERRNO_EQ(EINVAL); ASSERT_TRUE(nullptr == fp2); - libc_errno = 0; LIBC_NAMESPACE::close(fd); ASSERT_ERRNO_SUCCESS(); } diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 56bde5f0099a8..7c652f666a8f3 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp 
@@ -14,12 +14,12 @@ #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -33,7 +33,6 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index 90429ecf4e82b..f4471dd82df15 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -17,12 +17,12 @@ #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc_unlocked.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -36,7 +36,6 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index abed3d4052939..c00a9256af52d 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -12,11 +12,12 @@ #include "src/stdio/fgets.h" #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -#include "src/__support/libc_errno.h" +using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; -TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { +TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { constexpr char FILENAME[] = "testdata/fgets.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -35,7 +36,6 @@ TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index e624181c795b8..e097785832d56 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -17,17 +17,18 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" +using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns; -TEST(LlvmLibcFILETest, SimpleFileOperations) { +TEST_F(LlvmLibcFILETest, SimpleFileOperations) { 
constexpr char FILENAME[] = "testdata/simple_operations.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -41,7 +42,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); @@ -72,7 +72,6 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; LIBC_NAMESPACE::clearerr(file); @@ -80,15 +79,12 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file), returns(EQ(EOF)).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file), returns(EQ(size_t(0))).with_errno(NE(0))); - libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); @@ -103,10 +99,8 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); // This is not a readable file. - libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), returns(EQ(0)).with_errno(NE(0))); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -121,21 +115,18 @@ TEST(LlvmLibcFILETest, SimpleFileOperations) { // Check that the other functions correctly set libc_errno. 
- // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0); // ASSERT_ERRNO_FAILURE(); - // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0); // ASSERT_ERRNO_FAILURE(); - // libc_errno = 0; // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"), // static_cast(nullptr)); // ASSERT_ERRNO_FAILURE(); } -TEST(LlvmLibcFILETest, FFlush) { +TEST_F(LlvmLibcFILETest, FFlush) { constexpr char FILENAME[] = "testdata/fflush.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+"); ASSERT_FALSE(file == nullptr); @@ -156,7 +147,7 @@ TEST(LlvmLibcFILETest, FFlush) { ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); } -TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { +TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { using MyStruct = struct { char c; unsigned long long i; @@ -165,7 +156,6 @@ TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct); constexpr char FILENAME[] = "testdata/fread_fwrite.test"; - libc_errno = 0; FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file)); diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp index 03e1ac286b646..bcf5e674141a7 100644 --- a/libc/test/src/stdio/fopencookie_test.cpp +++ b/libc/test/src/stdio/fopencookie_test.cpp @@ -15,6 +15,7 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/MemoryMatcher.h" #include "test/UnitTest/Test.h" @@ -22,6 +23,7 @@ #include "hdr/types/size_t.h" #include "src/__support/libc_errno.h" +using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using MemoryView = LIBC_NAMESPACE::testing::MemoryView; struct StringStream { @@ -88,7 +90,7 @@ int close_ss(void *cookie) { constexpr cookie_io_functions_t STRING_STREAM_FUNCS = 
{&read_ss, &write_ss, &seek_ss, &close_ss}; -TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { constexpr char CONTENT[] = "Hello,readonly!"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(CONTENT))); @@ -115,7 +117,6 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -124,7 +125,7 @@ TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { size_t INIT_BUFSIZE = 32; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(INIT_BUFSIZE)); @@ -149,7 +150,6 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_EQ(EBADF); - libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -158,7 +158,7 @@ TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { constexpr char INITIAL_CONTENT[] = "1234567890987654321"; constexpr char WRITE_DATA[] = "append"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); @@ -178,7 +178,6 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -192,7 +191,7 @@ TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) { 
+TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) { const char INITIAL_CONTENT[] = "1234567890987654321"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(INITIAL_CONTENT))); @@ -223,7 +222,7 @@ TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) { free(ss); } -TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) { +TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) { constexpr char WRITE_DATA[] = "hello, file"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(WRITE_DATA))); diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp index 84984e26398c0..296bff1f5dc15 100644 --- a/libc/test/src/stdio/remove_test.cpp +++ b/libc/test/src/stdio/remove_test.cpp @@ -11,16 +11,17 @@ #include "src/sys/stat/mkdirat.h" #include "src/unistd/access.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -#include "src/__support/libc_errno.h" #include -TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { +using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { // The test strategy is to create a file and remove it, and also verify that // it was removed. - libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -36,10 +37,9 @@ TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT)); } -TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) { +TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) { // The test strategy is to create a dir and remove it, and also verify that // it was removed. 
- libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILENAME = "remove.test.dir"; diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp index ac494a4ecaf8e..135fb98c07fbb 100644 --- a/libc/test/src/stdio/rename_test.cpp +++ b/libc/test/src/stdio/rename_test.cpp @@ -8,18 +8,19 @@ #include "include/llvm-libc-macros/linux/sys-stat-macros.h" #include "include/llvm-libc-macros/linux/unistd-macros.h" -#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/rename.h" #include "src/unistd/access.h" #include "src/unistd/close.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -TEST(LlvmLibcRenameTest, CreateAndRenameFile) { +using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { // The test strategy is to create a file and rename it, and also verify that // it was renamed. 
- libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -40,7 +41,7 @@ TEST(LlvmLibcRenameTest, CreateAndRenameFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT)); } -TEST(LlvmLibcRenameTest, RenameNonExistent) { +TEST_F(LlvmLibcRenameTest, RenameNonExistent) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; constexpr const char *FILENAME1 = "rename.test.file1"; diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index 5872943c1bb41..4144bc1bef447 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -14,9 +14,10 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" -#include "src/__support/libc_errno.h" -TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { +using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; + +TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a NBF buffer to the write handle. Since it is NBF, the data // written using the write handle should be immediately readable by the read @@ -52,7 +53,7 @@ TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr)); } -TEST(LlvmLibcSetvbufTest, SetLBFBuffer) { +TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a LBF buffer to the write handle. 
Since it is LBF, the data // written using the write handle should be available right after a '\n' is @@ -102,6 +103,5 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) { 0); ASSERT_ERRNO_EQ(EINVAL); - libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f)); } diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp index 5d482b70064bd..e99b382d12112 100644 --- a/libc/test/src/stdio/unlocked_fileop_test.cpp +++ b/libc/test/src/stdio/unlocked_fileop_test.cpp @@ -15,11 +15,12 @@ #include "src/stdio/fread_unlocked.h" #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite_unlocked.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -#include "src/__support/libc_errno.h" +using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; -TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { +TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { constexpr char fNAME[] = "testdata/unlocked_read_and_write.test"; ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w"); ASSERT_FALSE(f == nullptr); @@ -36,7 +37,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); @@ -57,7 +57,6 @@ TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); - libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 3eeccc5727e77..03f0a6539c785 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -9,7 +9,6 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" #include 
"src/__support/ctype_utils.h" -#include "src/__support/libc_errno.h" #include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index c2f2b9c9a11c3..eb4056dc7ba64 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -7,7 +7,6 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" -#include "src/__support/libc_errno.h" #include "src/__support/uint128.h" #include "src/stdlib/strtold.h" From 3c7af175e51c3ab08ac3c442146c2b822f38c01e Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Wed, 11 Jun 2025 16:52:21 -0700 Subject: [PATCH 162/851] [libc] Fix stdio tests after #143802 (#143810) In #143802 the stdio test cleanup missed a few places where errno was being set to a failing value, and one where the framework needed to be included. --- libc/docs/configure.rst | 2 +- libc/test/src/stdio/fgetc_test.cpp | 1 + libc/test/src/stdio/fgetc_unlocked_test.cpp | 1 + libc/test/src/stdio/fgets_test.cpp | 1 + libc/test/src/stdio/setvbuf_test.cpp | 1 + 5 files changed, 5 insertions(+), 1 deletion(-) diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 8d53390ae19bf..109412225634f 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -29,7 +29,7 @@ to learn about the defaults for your platform and target. - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack. - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience.
* **"errno" options** - - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM. + - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE. * **"general" options** - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior. * **"math" options** diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 7c652f666a8f3..1faa49112fb63 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -33,6 +33,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index f4471dd82df15..7b2efe642fb5e 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -36,6 +36,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index c00a9256af52d..2d7c68d490811 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -36,6 +36,7 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index 4144bc1bef447..a0936ba79ef73 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -11,6 +11,7 @@ #include "src/stdio/fread.h" #include "src/stdio/fwrite.h" #include "src/stdio/setvbuf.h" +#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" From 6c72084a578a7a1e4dc1013a1a4a30b72ad5c6ab Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Wed, 11 Jun 2025 16:56:37 -0700 Subject: [PATCH 163/851] [bazel] port 1ecd108cb7ceda2b11281b5d173e2827feb60c55 --- utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel index 484d3e5e0a24e..505b73fd77111 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/stdio/BUILD.bazel @@ -122,6 +122,7 @@ libc_test( "//libc:mkdirat", "//libc:open", "//libc:remove", + "//libc/test/UnitTest:errno_test_helpers", ], ) From bc7ea63e9c885fbe71dec29581a206bc0543d22a Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 11 Jun 2025 20:04:27 -0400 Subject: [PATCH 164/851] 
[MemCpyOpt] handle memcpy from memset for non-constant sizes (#143727) Allows forwarding memset to memcpy for mismatching unknown sizes if overread has undef contents. In that case we can refine the undef bytes to the memset value. Refs #140954 which laid some of the groundwork for this. --- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 38 +++++++++---------- .../MemCpyOpt/variable-sized-memset-memcpy.ll | 6 +-- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 960001bf880c6..1c4ec6aa08b43 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -1440,7 +1440,7 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, int64_t MOffset = 0; const DataLayout &DL = MemCpy->getModule()->getDataLayout(); // We can only transforms memcpy's where the dest of one is the source of the - // other, or the memory transfer has a known offset from the memset. + // other, or they have a known offset. if (MemCpy->getSource() != MemSet->getDest()) { std::optional Offset = MemCpy->getSource()->getPointerOffsetFrom(MemSet->getDest(), DL); @@ -1451,28 +1451,28 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy, if (MOffset != 0 || MemSetSize != CopySize) { // Make sure the memcpy doesn't read any more than what the memset wrote, - // other than undef. Don't worry about sizes larger than i64. A known memset - // size is required. + // other than undef. Don't worry about sizes larger than i64. auto *CMemSetSize = dyn_cast(MemSetSize); - if (!CMemSetSize) - return false; - - // A known memcpy size is also required. 
auto *CCopySize = dyn_cast(CopySize); - if (!CCopySize) - return false; - if (CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) { + if (!CMemSetSize || !CCopySize || + CCopySize->getZExtValue() + MOffset > CMemSetSize->getZExtValue()) { if (!overreadUndefContents(MSSA, MemCpy, MemSet, BAA)) return false; - // Clip the memcpy to the bounds of the memset - if (MOffset == 0) - CopySize = MemSetSize; - else - CopySize = - ConstantInt::get(CopySize->getType(), - CMemSetSize->getZExtValue() <= (uint64_t)MOffset - ? 0 - : CMemSetSize->getZExtValue() - MOffset); + + if (CMemSetSize && CCopySize) { + // If both have constant sizes and offsets, clip the memcpy to the + // bounds of the memset if applicable. + assert(CCopySize->getZExtValue() + MOffset > + CMemSetSize->getZExtValue()); + if (MOffset == 0) + CopySize = MemSetSize; + else + CopySize = + ConstantInt::get(CopySize->getType(), + CMemSetSize->getZExtValue() <= (uint64_t)MOffset + ? 0 + : CMemSetSize->getZExtValue() - MOffset); + } } } diff --git a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll index d5b1ab9b2f299..4b44f8b44f74a 100644 --- a/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/variable-sized-memset-memcpy.ll @@ -19,12 +19,12 @@ define void @test(ptr %src, i8 %c, i64 %size) { } ; Differing sizes, but would be UB if size1 < size2 since the memcpy would reference outside of the first alloca -define void @negative_test(ptr %src, i8 %c, i64 %size1, i64 %size2) { -; CHECK-LABEL: @negative_test( +define void @dynsize_test(ptr %src, i8 %c, i64 %size1, i64 %size2) { +; CHECK-LABEL: @dynsize_test( ; CHECK-NEXT: [[DST1:%.*]] = alloca i8, i64 [[SIZE1:%.*]], align 1 ; CHECK-NEXT: [[DST2:%.*]] = alloca i8, i64 [[SIZE2:%.*]], align 1 ; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST1]], i8 [[C:%.*]], i64 [[SIZE1]], i1 false) -; CHECK-NEXT: call void 
@llvm.memcpy.p0.p0.i64(ptr align 8 [[DST2]], ptr align 8 [[DST1]], i64 [[SIZE2]], i1 false) +; CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 8 [[DST2]], i8 [[C]], i64 [[SIZE2]], i1 false) ; CHECK-NEXT: ret void ; %dst1 = alloca i8, i64 %size1 From d7c6cad744bc7ed28535dc6f75629902eda559ea Mon Sep 17 00:00:00 2001 From: Jake Egan Date: Wed, 11 Jun 2025 20:22:15 -0400 Subject: [PATCH 165/851] [sanitizer_common] Implement interception on AIX (#138606) Adjust AIX interceptor support in sanitizer_common. Issue: https://github.com/llvm/llvm-project/issues/138916 --- .../sanitizer_common_interceptors.inc | 43 ++++++++----- .../sanitizer_common_interceptors_ioctl.inc | 2 + ...izer_common_interceptors_memintrinsics.inc | 8 ++- .../sanitizer_platform_interceptors.h | 61 +++++++++++-------- .../sanitizer_redefine_builtins.h | 2 +- 5 files changed, 73 insertions(+), 43 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index 9272e2ab6cbd5..2d6cf7fc3282f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -481,7 +481,8 @@ INTERCEPTOR(char*, textdomain, const char *domainname) { #endif #if SANITIZER_INTERCEPT_STRCMP || SANITIZER_INTERCEPT_MEMCMP -static inline int CharCmpX(unsigned char c1, unsigned char c2) { +[[maybe_unused]] static inline int CharCmpX(unsigned char c1, + unsigned char c2) { return (c1 == c2) ? 0 : (c1 < c2) ? -1 : 1; } #endif @@ -1350,7 +1351,8 @@ INTERCEPTOR(unsigned long, time, unsigned long *t) { #if SANITIZER_INTERCEPT_LOCALTIME_AND_FRIENDS static void unpoison_tm(void *ctx, __sanitizer_tm *tm) { COMMON_INTERCEPTOR_WRITE_RANGE(ctx, tm, sizeof(*tm)); -#if !SANITIZER_SOLARIS +// AIX tm struct does not have tm_zone field. 
+# if !SANITIZER_SOLARIS && !SANITIZER_AIX if (tm->tm_zone) { // Can not use COMMON_INTERCEPTOR_WRITE_RANGE here, because tm->tm_zone // can point to shared memory and tsan would report a data race. @@ -1735,10 +1737,12 @@ INTERCEPTOR(int, __vsprintf_chk, char *str, int flag, SIZE_T size_to, VSPRINTF_INTERCEPTOR_IMPL(vsprintf, str, format, ap) #endif +# if SANITIZER_INTERCEPT_VASPRINTF INTERCEPTOR(int, vasprintf, char **strp, const char *format, va_list ap) VASPRINTF_INTERCEPTOR_IMPL(vasprintf, strp, format, ap) +# endif -#if SANITIZER_INTERCEPT_ISOC99_PRINTF +# if SANITIZER_INTERCEPT_ISOC99_PRINTF INTERCEPTOR(int, __isoc99_vprintf, const char *format, va_list ap) VPRINTF_INTERCEPTOR_IMPL(__isoc99_vprintf, format, ap) @@ -1787,10 +1791,12 @@ INTERCEPTOR(int, __snprintf_chk, char *str, SIZE_T size, int flag, FORMAT_INTERCEPTOR_IMPL(__snprintf_chk, vsnprintf, str, size, format) #endif +# if SANITIZER_INTERCEPT_ASPRINTF INTERCEPTOR(int, asprintf, char **strp, const char *format, ...) FORMAT_INTERCEPTOR_IMPL(asprintf, vasprintf, strp, format) +# endif -#if SANITIZER_INTERCEPT_ISOC99_PRINTF +# if SANITIZER_INTERCEPT_ISOC99_PRINTF INTERCEPTOR(int, __isoc99_printf, const char *format, ...) 
FORMAT_INTERCEPTOR_IMPL(__isoc99_printf, __isoc99_vprintf, format) @@ -1811,17 +1817,24 @@ FORMAT_INTERCEPTOR_IMPL(__isoc99_snprintf, __isoc99_vsnprintf, str, size, #endif // SANITIZER_INTERCEPT_PRINTF #if SANITIZER_INTERCEPT_PRINTF -#define INIT_PRINTF \ - COMMON_INTERCEPT_FUNCTION_LDBL(printf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(sprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(snprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(asprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(fprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vsprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vsnprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vasprintf); \ - COMMON_INTERCEPT_FUNCTION_LDBL(vfprintf); +# define INIT_PRINTF_COMMON \ + COMMON_INTERCEPT_FUNCTION_LDBL(printf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(sprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(snprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(fprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vsprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vsnprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vfprintf); +# if !SANITIZER_AIX +// AIX does not have [v]asprintf. 
+# define INIT_PRINTF_EXTRA \ + COMMON_INTERCEPT_FUNCTION_LDBL(asprintf); \ + COMMON_INTERCEPT_FUNCTION_LDBL(vasprintf); +# else +# define INIT_PRINTF_EXTRA +# endif +# define INIT_PRINTF INIT_PRINTF_COMMON INIT_PRINTF_EXTRA #else #define INIT_PRINTF #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc index bc8f02826c614..08c2be47f5358 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_ioctl.inc @@ -79,7 +79,9 @@ static void ioctl_table_fill() { _(TIOCMSET, READ, sizeof(int)); _(TIOCNXCL, NONE, 0); _(TIOCOUTQ, WRITE, sizeof(int)); +# if !SANITIZER_AIX _(TIOCSCTTY, NONE, 0); +# endif _(TIOCSPGRP, READ, pid_t_sz); _(TIOCSWINSZ, READ, struct_winsize_sz); diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc index 1565a494140f6..0b6731c89950b 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors_memintrinsics.inc @@ -33,11 +33,13 @@ // Platform-specific options. 
#if SANITIZER_APPLE -#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 +# define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 #elif SANITIZER_WINDOWS64 -#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 +# define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 +#elif SANITIZER_AIX +# define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 0 #else -#define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1 +# define PLATFORM_HAS_DIFFERENT_MEMCPY_AND_MEMMOVE 1 #endif // SANITIZER_APPLE #ifndef COMMON_INTERCEPTOR_MEMSET_IMPL diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h index 4bc55d7801db7..ccc808b60ca75 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_interceptors.h @@ -141,6 +141,12 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SI_SOLARIS 0 #endif +#if SANITIZER_AIX +# define SI_NOT_AIX 0 +#else +# define SI_NOT_AIX 1 +#endif + #if SANITIZER_SOLARIS32 #define SI_SOLARIS32 1 #else @@ -161,20 +167,20 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_STRLEN SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_STRNLEN (SI_NOT_MAC && SI_NOT_FUCHSIA) -#define SANITIZER_INTERCEPT_STRCMP SI_NOT_FUCHSIA +#define SANITIZER_INTERCEPT_STRCMP (SI_NOT_FUCHSIA && SI_NOT_AIX) #define SANITIZER_INTERCEPT_STRSTR SI_NOT_FUCHSIA -#define SANITIZER_INTERCEPT_STRCASESTR SI_POSIX +#define SANITIZER_INTERCEPT_STRCASESTR (SI_POSIX && SI_NOT_AIX) #define SANITIZER_INTERCEPT_STRTOK SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_STRCHR SI_NOT_FUCHSIA -#define SANITIZER_INTERCEPT_STRCHRNUL SI_POSIX_NOT_MAC +#define SANITIZER_INTERCEPT_STRCHRNUL (SI_POSIX_NOT_MAC && SI_NOT_AIX) #define SANITIZER_INTERCEPT_STRRCHR SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_STRSPN SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_STRPBRK SI_NOT_FUCHSIA #define 
SANITIZER_INTERCEPT_TEXTDOMAIN SI_LINUX_NOT_ANDROID || SI_SOLARIS #define SANITIZER_INTERCEPT_STRCASECMP SI_POSIX #define SANITIZER_INTERCEPT_MEMSET 1 -#define SANITIZER_INTERCEPT_MEMMOVE 1 -#define SANITIZER_INTERCEPT_MEMCPY 1 +#define SANITIZER_INTERCEPT_MEMMOVE SI_NOT_AIX +#define SANITIZER_INTERCEPT_MEMCPY SI_NOT_AIX #define SANITIZER_INTERCEPT_MEMCMP SI_NOT_FUCHSIA #define SANITIZER_INTERCEPT_BCMP \ SANITIZER_INTERCEPT_MEMCMP && \ @@ -233,9 +239,11 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_ISOC99_SCANF SI_GLIBC #ifndef SANITIZER_INTERCEPT_PRINTF -#define SANITIZER_INTERCEPT_PRINTF SI_POSIX -#define SANITIZER_INTERCEPT_PRINTF_L (SI_FREEBSD || SI_NETBSD) -#define SANITIZER_INTERCEPT_ISOC99_PRINTF SI_GLIBC +# define SANITIZER_INTERCEPT_ASPRINTF SI_NOT_AIX +# define SANITIZER_INTERCEPT_VASPRINTF SI_NOT_AIX +# define SANITIZER_INTERCEPT_PRINTF SI_POSIX +# define SANITIZER_INTERCEPT_PRINTF_L (SI_FREEBSD || SI_NETBSD) +# define SANITIZER_INTERCEPT_ISOC99_PRINTF SI_GLIBC #endif #define SANITIZER_INTERCEPT_SETPROCTITLE (SI_FREEBSD || SI_NETBSD) @@ -243,8 +251,9 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT___PRINTF_CHK \ (SANITIZER_INTERCEPT_PRINTF && SI_GLIBC) -#define SANITIZER_INTERCEPT_FREXP SI_NOT_FUCHSIA -#define SANITIZER_INTERCEPT_FREXPF SI_POSIX +// AIX libc does not export FREXP and FREXPF. 
+#define SANITIZER_INTERCEPT_FREXP (SI_NOT_FUCHSIA && SI_NOT_AIX) +#define SANITIZER_INTERCEPT_FREXPF (SI_POSIX && SI_NOT_AIX) #define SANITIZER_INTERCEPT_FREXPL SI_POSIX #define SANITIZER_INTERCEPT_GETPWNAM_AND_FRIENDS SI_POSIX @@ -294,7 +303,7 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_ACCEPT4 \ (SI_LINUX_NOT_ANDROID || SI_NETBSD || SI_FREEBSD) #define SANITIZER_INTERCEPT_PACCEPT SI_NETBSD -#define SANITIZER_INTERCEPT_MODF SI_POSIX +#define SANITIZER_INTERCEPT_MODF (SI_POSIX && SI_NOT_AIX) #define SANITIZER_INTERCEPT_RECVMSG SI_POSIX #define SANITIZER_INTERCEPT_SENDMSG SI_POSIX #define SANITIZER_INTERCEPT_RECVMMSG SI_LINUX @@ -329,8 +338,9 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT___WCSXFRM_L SI_LINUX #define SANITIZER_INTERCEPT_WCSNRTOMBS \ (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS) -#define SANITIZER_INTERCEPT_WCRTOMB \ - (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS) +#define SANITIZER_INTERCEPT_WCRTOMB \ + (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS || \ + !SI_NOT_AIX) #define SANITIZER_INTERCEPT_WCTOMB \ (SI_FREEBSD || SI_NETBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS) #define SANITIZER_INTERCEPT_TCGETATTR SI_LINUX_NOT_ANDROID || SI_SOLARIS @@ -370,7 +380,8 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_GETMNTENT_R SI_LINUX_NOT_ANDROID #define SANITIZER_INTERCEPT_STATFS \ (SI_FREEBSD || SI_MAC || SI_LINUX_NOT_ANDROID || SI_SOLARIS) -#define SANITIZER_INTERCEPT_STATFS64 SI_GLIBC && SANITIZER_HAS_STATFS64 +#define SANITIZER_INTERCEPT_STATFS64 \ + ((SI_GLIBC || !SI_NOT_AIX) && SANITIZER_HAS_STATFS64) #define SANITIZER_INTERCEPT_STATVFS \ (SI_FREEBSD || SI_NETBSD || SI_LINUX_NOT_ANDROID) #define SANITIZER_INTERCEPT_STATVFS64 SI_GLIBC @@ -419,10 +430,10 @@ SANITIZER_WEAK_IMPORT void 
*aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_TTYNAME_R SI_POSIX #define SANITIZER_INTERCEPT_TEMPNAM SI_POSIX #define SANITIZER_INTERCEPT_SINCOS SI_LINUX || SI_SOLARIS -#define SANITIZER_INTERCEPT_REMQUO SI_POSIX -#define SANITIZER_INTERCEPT_REMQUOL (SI_POSIX && !SI_NETBSD) -#define SANITIZER_INTERCEPT_LGAMMA SI_POSIX -#define SANITIZER_INTERCEPT_LGAMMAL (SI_POSIX && !SI_NETBSD) +#define SANITIZER_INTERCEPT_REMQUO (SI_POSIX && SI_NOT_AIX) +#define SANITIZER_INTERCEPT_REMQUOL (SI_POSIX && !SI_NETBSD && SI_NOT_AIX) +#define SANITIZER_INTERCEPT_LGAMMA (SI_POSIX && SI_NOT_AIX) +#define SANITIZER_INTERCEPT_LGAMMAL (SI_POSIX && !SI_NETBSD && SI_NOT_AIX) #define SANITIZER_INTERCEPT_LGAMMA_R (SI_FREEBSD || SI_LINUX || SI_SOLARIS) #define SANITIZER_INTERCEPT_LGAMMAL_R SI_LINUX_NOT_ANDROID || SI_SOLARIS #define SANITIZER_INTERCEPT_DRAND48_R SI_GLIBC @@ -505,11 +516,13 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_EVENTFD_READ_WRITE (SI_LINUX || SI_FREEBSD) #define SI_STAT_LINUX (SI_LINUX && __GLIBC_PREREQ(2, 33)) -#define SANITIZER_INTERCEPT_STAT \ - (SI_FREEBSD || SI_MAC || SI_ANDROID || SI_NETBSD || SI_SOLARIS || \ - SI_STAT_LINUX) -#define SANITIZER_INTERCEPT_STAT64 SI_STAT_LINUX && SANITIZER_HAS_STAT64 -#define SANITIZER_INTERCEPT_LSTAT (SI_NETBSD || SI_FREEBSD || SI_STAT_LINUX) +#define SANITIZER_INTERCEPT_STAT \ + (SI_FREEBSD || SI_MAC || SI_ANDROID || SI_NETBSD || SI_SOLARIS || \ + SI_STAT_LINUX || !SI_NOT_AIX) +#define SANITIZER_INTERCEPT_STAT64 \ + ((SI_STAT_LINUX || !SI_NOT_AIX) && SANITIZER_HAS_STAT64) +#define SANITIZER_INTERCEPT_LSTAT \ + (SI_NETBSD || SI_FREEBSD || SI_STAT_LINUX || !SI_NOT_AIX) #define SANITIZER_INTERCEPT___XSTAT \ ((!SANITIZER_INTERCEPT_STAT && SI_POSIX) || SI_STAT_LINUX) #define SANITIZER_INTERCEPT___XSTAT64 SI_GLIBC @@ -578,7 +591,7 @@ SANITIZER_WEAK_IMPORT void *aligned_alloc(__sanitizer::usize __alignment, #define SANITIZER_INTERCEPT_PROTOENT_R SI_GLIBC 
#define SANITIZER_INTERCEPT_NETENT (SI_LINUX || SI_NETBSD || SI_FREEBSD) #define SANITIZER_INTERCEPT_SETVBUF \ - (SI_NETBSD || SI_FREEBSD || SI_LINUX || SI_MAC) + (SI_NETBSD || SI_FREEBSD || SI_LINUX || SI_MAC || !SI_NOT_AIX) #define SANITIZER_INTERCEPT_GETMNTINFO (SI_NETBSD || SI_FREEBSD || SI_MAC) #define SANITIZER_INTERCEPT_MI_VECTOR_HASH SI_NETBSD #define SANITIZER_INTERCEPT_GETVFSSTAT SI_NETBSD diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h b/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h index 41e0613d6fc13..bda0f04687693 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_redefine_builtins.h @@ -15,7 +15,7 @@ # define SANITIZER_REDEFINE_BUILTINS_H // The asm hack only works with GCC and Clang. -# if !defined(_WIN32) +# if !defined(_WIN32) && !defined(_AIX) asm(R"( .set memcpy, __sanitizer_internal_memcpy From 7a3bcf9f7179e6904d405de36360714da07c31ba Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Wed, 11 Jun 2025 21:50:35 +0800 Subject: [PATCH 166/851] [RISCV] Add missing predicate for PseudoTHVdotVMAQA family instructions --- llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td index 2fccbcaf2cf37..89441444a994e 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXTHead.td @@ -660,10 +660,12 @@ def : Pat<(i32 (sub GPR:$rd, (mul (sexti16 (i32 GPR:$rs1)), (TH_MULSH GPR:$rd, GPR:$rs1, GPR:$rs2)>; } // Predicates = [HasVendorXTHeadMac, IsRV32] +let Predicates = [HasVendorXTHeadVdot] in { defm PseudoTHVdotVMAQA : VPseudoVMAQA_VV_VX; defm PseudoTHVdotVMAQAU : VPseudoVMAQA_VV_VX; defm PseudoTHVdotVMAQASU : VPseudoVMAQA_VV_VX; defm PseudoTHVdotVMAQAUS : VPseudoVMAQA_VX; +} let Predicates = [HasVendorXTHeadVdot] in { defm : VPatTernaryVMAQA_VV_VX<"int_riscv_th_vmaqa", 
"PseudoTHVdotVMAQA", From 7034014d08249a1e159a668a71e96a0b78636a39 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Wed, 11 Jun 2025 18:07:00 -0700 Subject: [PATCH 167/851] [InstCombine] Combine or-disjoint (and->mul), (and->mul) to and->mul (#136013) The canonical pattern for bitmasked mul is currently ``` %val = and %x, %bitMask // where %bitMask is some constant %cmp = icmp eq %val, 0 %sel = select %cmp, 0, %C // where %C is some constant = C' * %bitMask ``` In certain cases, where we are combining multiple of these bitmasked muls with common factors, we are able to optimize into and->mul (see https://github.com/llvm/llvm-project/pull/135274 ) This optimization lends itself to further optimizations. This PR addresses one of such optimizations. In cases where we have `or-disjoint ( mul(and (X, C1), D) , mul (and (X, C2), D))` we can combine into `mul( and (X, (C1 + C2)), D) ` provided C1 and C2 are disjoint. Generalized proof: https://alive2.llvm.org/ce/z/MQYMui --- .../InstCombine/InstCombineAndOrXor.cpp | 130 ++++++++++++------ .../test/Transforms/InstCombine/or-bitmask.ll | 116 ++++++++++++++-- 2 files changed, 190 insertions(+), 56 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index c6c231f81c4ab..dce695a036006 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -3592,6 +3592,73 @@ static Value *foldOrOfInversions(BinaryOperator &I, return nullptr; } +// A decomposition of ((X & Mask) * Factor). The NUW / NSW bools +// track these properties for preservation. Note that we can decompose +// equivalent select form of this expression (e.g. (!(X & Mask) ?
0 : Mask * +// Factor)) +struct DecomposedBitMaskMul { + Value *X; + APInt Factor; + APInt Mask; + bool NUW; + bool NSW; +}; + +static std::optional matchBitmaskMul(Value *V) { + Instruction *Op = dyn_cast(V); + if (!Op) + return std::nullopt; + + // Decompose ((A & N) * C) into BitMaskMul + Value *Original = nullptr; + const APInt *Mask = nullptr; + const APInt *MulConst = nullptr; + if (match(Op, m_Mul(m_And(m_Value(Original), m_APInt(Mask)), + m_APInt(MulConst)))) { + if (MulConst->isZero() || Mask->isZero()) + return std::nullopt; + + return std::optional( + {Original, *MulConst, *Mask, + cast(Op)->hasNoUnsignedWrap(), + cast(Op)->hasNoSignedWrap()}); + } + + Value *Cond = nullptr; + const APInt *EqZero = nullptr, *NeZero = nullptr; + + // Decompose (!(A & N) ? 0 : N * C) into BitMaskMul + if (match(Op, m_Select(m_Value(Cond), m_APInt(EqZero), m_APInt(NeZero)))) { + auto ICmpDecompose = + decomposeBitTest(Cond, /*LookThruTrunc=*/true, + /*AllowNonZeroC=*/false, /*DecomposeBitMask=*/true); + if (!ICmpDecompose.has_value()) + return std::nullopt; + + assert(ICmpInst::isEquality(ICmpDecompose->Pred) && + ICmpDecompose->C.isZero()); + + if (ICmpDecompose->Pred == ICmpInst::ICMP_NE) + std::swap(EqZero, NeZero); + + if (!EqZero->isZero() || NeZero->isZero()) + return std::nullopt; + + if (!ICmpDecompose->Mask.isPowerOf2() || ICmpDecompose->Mask.isZero() || + NeZero->getBitWidth() != ICmpDecompose->Mask.getBitWidth()) + return std::nullopt; + + if (!NeZero->urem(ICmpDecompose->Mask).isZero()) + return std::nullopt; + + return std::optional( + {ICmpDecompose->X, NeZero->udiv(ICmpDecompose->Mask), + ICmpDecompose->Mask, /*NUW=*/false, /*NSW=*/false}); + } + + return std::nullopt; +} + // FIXME: We use commutative matchers (m_c_*) for some, but not all, matches // here. We should standardize that construct where it is needed or choose some // other way to ensure that commutated variants of patterns are not missed.
@@ -3674,49 +3741,26 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { /*NSW=*/true, /*NUW=*/true)) return R; - Value *Cond0 = nullptr, *Cond1 = nullptr; - const APInt *Op0Eq = nullptr, *Op0Ne = nullptr; - const APInt *Op1Eq = nullptr, *Op1Ne = nullptr; - - // (!(A & N) ? 0 : N * C) + (!(A & M) ? 0 : M * C) -> A & (N + M) * C - if (match(I.getOperand(0), - m_Select(m_Value(Cond0), m_APInt(Op0Eq), m_APInt(Op0Ne))) && - match(I.getOperand(1), - m_Select(m_Value(Cond1), m_APInt(Op1Eq), m_APInt(Op1Ne)))) { - - auto LHSDecompose = - decomposeBitTest(Cond0, /*LookThruTrunc=*/true, - /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true); - auto RHSDecompose = - decomposeBitTest(Cond1, /*LookThruTrunc=*/true, - /*AllowNonZeroC=*/false, /*DecomposeAnd=*/true); - - if (LHSDecompose && RHSDecompose && LHSDecompose->X == RHSDecompose->X && - RHSDecompose->Mask.isPowerOf2() && LHSDecompose->Mask.isPowerOf2() && - LHSDecompose->Mask != RHSDecompose->Mask && - LHSDecompose->Mask.getBitWidth() == Op0Ne->getBitWidth() && - RHSDecompose->Mask.getBitWidth() == Op1Ne->getBitWidth()) { - assert(Op0Ne->getBitWidth() == Op1Ne->getBitWidth()); - assert(ICmpInst::isEquality(LHSDecompose->Pred)); - if (LHSDecompose->Pred == ICmpInst::ICMP_NE) - std::swap(Op0Eq, Op0Ne); - if (RHSDecompose->Pred == ICmpInst::ICMP_NE) - std::swap(Op1Eq, Op1Ne); - - if (!Op0Ne->isZero() && !Op1Ne->isZero() && Op0Eq->isZero() && - Op1Eq->isZero() && Op0Ne->urem(LHSDecompose->Mask).isZero() && - Op1Ne->urem(RHSDecompose->Mask).isZero() && - Op0Ne->udiv(LHSDecompose->Mask) == - Op1Ne->udiv(RHSDecompose->Mask)) { - auto NewAnd = Builder.CreateAnd( - LHSDecompose->X, - ConstantInt::get(LHSDecompose->X->getType(), - (LHSDecompose->Mask + RHSDecompose->Mask))); - - return BinaryOperator::CreateMul( - NewAnd, ConstantInt::get(NewAnd->getType(), - Op0Ne->udiv(LHSDecompose->Mask))); - } + // (A & N) * C + (A & M) * C -> (A & (N + M)) * C + // This also accepts the equivalent select form of (A & N) * C + //
expressions, i.e. (!(A & N) ? 0 : N * C) + auto Decomp1 = matchBitmaskMul(I.getOperand(1)); + if (Decomp1) { + auto Decomp0 = matchBitmaskMul(I.getOperand(0)); + if (Decomp0 && Decomp0->X == Decomp1->X && + (Decomp0->Mask & Decomp1->Mask).isZero() && + Decomp0->Factor == Decomp1->Factor) { + + Value *NewAnd = Builder.CreateAnd( + Decomp0->X, ConstantInt::get(Decomp0->X->getType(), + (Decomp0->Mask + Decomp1->Mask))); + + auto *Combined = BinaryOperator::CreateMul( + NewAnd, ConstantInt::get(NewAnd->getType(), Decomp1->Factor)); + + Combined->setHasNoUnsignedWrap(Decomp0->NUW && Decomp1->NUW); + Combined->setHasNoSignedWrap(Decomp0->NSW && Decomp1->NSW); + return Combined; } } } diff --git a/llvm/test/Transforms/InstCombine/or-bitmask.ll b/llvm/test/Transforms/InstCombine/or-bitmask.ll index 3b482dc1794db..3c992dfea569a 100644 --- a/llvm/test/Transforms/InstCombine/or-bitmask.ll +++ b/llvm/test/Transforms/InstCombine/or-bitmask.ll @@ -36,13 +36,9 @@ define i32 @add_select_cmp_and2(i32 %in) { define i32 @add_select_cmp_and3(i32 %in) { ; CHECK-LABEL: @add_select_cmp_and3( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 -; CHECK-NEXT: [[TEMP:%.*]] = mul nuw nsw i32 [[TMP1]], 72 -; CHECK-NEXT: [[BITOP2:%.*]] = and i32 [[IN]], 4 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[BITOP2]], 0 -; CHECK-NEXT: [[SEL2:%.*]] = select i1 [[CMP2]], i32 0, i32 288 -; CHECK-NEXT: [[OUT:%.*]] = or disjoint i32 [[TEMP]], [[SEL2]] -; CHECK-NEXT: ret i32 [[OUT]] +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 7 +; CHECK-NEXT: [[TEMP1:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: ret i32 [[TEMP1]] ; %bitop0 = and i32 %in, 1 %cmp0 = icmp eq i32 %bitop0, 0 @@ -60,12 +56,9 @@ define i32 @add_select_cmp_and3(i32 %in) { define i32 @add_select_cmp_and4(i32 %in) { ; CHECK-LABEL: @add_select_cmp_and4( -; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 -; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 -; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IN]], 12 -; CHECK-NEXT: [[TEMP3:%.*]] = mul
nuw nsw i32 [[TMP2]], 72 -; CHECK-NEXT: [[OUT1:%.*]] = or disjoint i32 [[OUT]], [[TEMP3]] -; CHECK-NEXT: ret i32 [[OUT1]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[IN:%.*]], 15 +; CHECK-NEXT: [[TEMP2:%.*]] = mul nuw nsw i32 [[TMP2]], 72 +; CHECK-NEXT: ret i32 [[TEMP2]] ; %bitop0 = and i32 %in, 1 %cmp0 = icmp eq i32 %bitop0, 0 @@ -361,6 +354,103 @@ define i64 @mask_select_types_1(i64 %in) { ret i64 %out } +define i32 @add_select_cmp_mixed1(i32 %in) { +; CHECK-LABEL: @add_select_cmp_mixed1( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 +; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: ret i32 [[OUT]] +; + %mask = and i32 %in, 1 + %sel0 = mul i32 %mask, 72 + %bitop1 = and i32 %in, 2 + %cmp1 = icmp eq i32 %bitop1, 0 + %sel1 = select i1 %cmp1, i32 0, i32 144 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @add_select_cmp_mixed2(i32 %in) { +; CHECK-LABEL: @add_select_cmp_mixed2( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 +; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: ret i32 [[OUT]] +; + %bitop0 = and i32 %in, 1 + %cmp0 = icmp eq i32 %bitop0, 0 + %mask = and i32 %in, 2 + %sel0 = select i1 %cmp0, i32 0, i32 72 + %sel1 = mul i32 %mask, 72 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @add_select_cmp_and_mul(i32 %in) { +; CHECK-LABEL: @add_select_cmp_and_mul( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 3 +; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: ret i32 [[OUT]] +; + %mask0 = and i32 %in, 1 + %sel0 = mul i32 %mask0, 72 + %mask1 = and i32 %in, 2 + %sel1 = mul i32 %mask1, 72 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @add_select_cmp_mixed2_mismatch(i32 %in) { +; CHECK-LABEL: @add_select_cmp_mixed2_mismatch( +; CHECK-NEXT: [[BITOP0:%.*]] = and i32 [[IN:%.*]], 1 +; CHECK-NEXT: [[CMP0:%.*]] = icmp eq i32 [[BITOP0]], 0 +; CHECK-NEXT: [[MASK:%.*]] = and i32 [[IN]], 2 +; CHECK-NEXT: [[SEL0:%.*]] = select i1 
[[CMP0]], i32 0, i32 73 +; CHECK-NEXT: [[SEL1:%.*]] = mul nuw nsw i32 [[MASK]], 72 +; CHECK-NEXT: [[OUT:%.*]] = or disjoint i32 [[SEL0]], [[SEL1]] +; CHECK-NEXT: ret i32 [[OUT]] +; + %bitop0 = and i32 %in, 1 + %cmp0 = icmp eq i32 %bitop0, 0 + %mask = and i32 %in, 2 + %sel0 = select i1 %cmp0, i32 0, i32 73 + %sel1 = mul i32 %mask, 72 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @add_select_cmp_and_mul_mismatch(i32 %in) { +; CHECK-LABEL: @add_select_cmp_and_mul_mismatch( +; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[IN:%.*]] to i1 +; CHECK-NEXT: [[SEL0:%.*]] = select i1 [[TMP1]], i32 73, i32 0 +; CHECK-NEXT: [[MASK1:%.*]] = and i32 [[IN]], 2 +; CHECK-NEXT: [[SEL1:%.*]] = mul nuw nsw i32 [[MASK1]], 72 +; CHECK-NEXT: [[OUT:%.*]] = or disjoint i32 [[SEL0]], [[SEL1]] +; CHECK-NEXT: ret i32 [[OUT]] +; + %mask0 = and i32 %in, 1 + %sel0 = mul i32 %mask0, 73 + %mask1 = and i32 %in, 2 + %sel1 = mul i32 %mask1, 72 + %out = or disjoint i32 %sel0, %sel1 + ret i32 %out +} + +define i32 @and_mul_non_disjoint(i32 %in) { +; CHECK-LABEL: @and_mul_non_disjoint( +; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[IN:%.*]], 2 +; CHECK-NEXT: [[OUT:%.*]] = mul nuw nsw i32 [[TMP1]], 72 +; CHECK-NEXT: [[MASK1:%.*]] = and i32 [[IN]], 4 +; CHECK-NEXT: [[SEL1:%.*]] = mul nuw nsw i32 [[MASK1]], 72 +; CHECK-NEXT: [[OUT1:%.*]] = or i32 [[OUT]], [[SEL1]] +; CHECK-NEXT: ret i32 [[OUT1]] +; + %mask0 = and i32 %in, 2 + %sel0 = mul i32 %mask0, 72 + %mask1 = and i32 %in, 4 + %sel1 = mul i32 %mask1, 72 + %out = or i32 %sel0, %sel1 + ret i32 %out +} + ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; CONSTSPLAT: {{.*}} ; CONSTVEC: {{.*}} From c4316180418ce8de4b4c9812c7fac791d55b6102 Mon Sep 17 00:00:00 2001 From: Shunsuke Watanabe Date: Thu, 12 Jun 2025 10:19:26 +0900 Subject: [PATCH 168/851] [Clang][Driver] Override complex number calculation method by -fno-fast-math (#132680) This patch fixes a bug where -fno-fast-math doesn't revert the complex number calculation method to the default. The priority of overriding options related to complex number calculations differs slightly from GCC, as discussed in: https://discourse.llvm.org/t/the-priority-of-fno-fast-math-regarding-complex-number-calculations/84679 --- clang/lib/Driver/ToolChains/Clang.cpp | 22 +++++- clang/test/Driver/range.c | 100 +++++++++++++++++++++++--- 2 files changed, 112 insertions(+), 10 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index a74fa81f3cf5b..1d11be1d82be8 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2831,8 +2831,9 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, StringRef Float16ExcessPrecision = ""; StringRef BFloat16ExcessPrecision = ""; LangOptions::ComplexRangeKind Range = LangOptions::ComplexRangeKind::CX_None; - std::string ComplexRangeStr = ""; - std::string GccRangeComplexOption = ""; + std::string ComplexRangeStr; + std::string GccRangeComplexOption; + std::string LastComplexRangeOption; auto setComplexRange = [&](LangOptions::ComplexRangeKind NewRange) { // Warn if user expects to perform full implementation of complex @@ -2916,6 +2917,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, EmitComplexRangeDiag(D, GccRangeComplexOption, "-fcx-limited-range"); } GccRangeComplexOption = "-fcx-limited-range"; + LastComplexRangeOption = A->getSpelling(); Range = LangOptions::ComplexRangeKind::CX_Basic; break; case options::OPT_fno_cx_limited_range: @@ -2929,6 +2931,7 @@ 
static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, "-fno-cx-limited-range"); } GccRangeComplexOption = "-fno-cx-limited-range"; + LastComplexRangeOption = A->getSpelling(); Range = LangOptions::ComplexRangeKind::CX_Full; break; case options::OPT_fcx_fortran_rules: @@ -2938,6 +2941,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, else EmitComplexRangeDiag(D, GccRangeComplexOption, "-fcx-fortran-rules"); GccRangeComplexOption = "-fcx-fortran-rules"; + LastComplexRangeOption = A->getSpelling(); Range = LangOptions::ComplexRangeKind::CX_Improved; break; case options::OPT_fno_cx_fortran_rules: @@ -2950,6 +2954,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, "-fno-cx-fortran-rules"); } GccRangeComplexOption = "-fno-cx-fortran-rules"; + LastComplexRangeOption = A->getSpelling(); Range = LangOptions::ComplexRangeKind::CX_Full; break; case options::OPT_fcomplex_arithmetic_EQ: { @@ -2984,6 +2989,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, ComplexArithmeticStr(RangeVal)); } } + LastComplexRangeOption = + Args.MakeArgString(A->getSpelling() + A->getValue()); Range = RangeVal; break; } @@ -3037,6 +3044,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, } else D.Diag(diag::err_drv_unsupported_option_argument) << A->getSpelling() << Val; + LastComplexRangeOption = A->getSpelling(); break; } @@ -3222,6 +3230,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, [[fallthrough]]; case options::OPT_ffast_math: applyFastMath(true); + LastComplexRangeOption = A->getSpelling(); if (A->getOption().getID() == options::OPT_Ofast) LastFpContractOverrideOption = "-Ofast"; else @@ -3239,6 +3248,15 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, ApproxFunc = false; SignedZeros = true; restoreFPContractState(); + // If the last specified option related to complex range is not 
+ // -ffast-math or -ffp-model=, emit warning. + if (LastComplexRangeOption != "-ffast-math" && + LastComplexRangeOption != "-ffp-model=" && + Range != LangOptions::ComplexRangeKind::CX_Full) + EmitComplexRangeDiag(D, LastComplexRangeOption, "-fno-fast-math"); + Range = LangOptions::ComplexRangeKind::CX_None; + LastComplexRangeOption = ""; + GccRangeComplexOption = ""; LastFpContractOverrideOption = ""; break; } // End switch (A->getOption().getID()) diff --git a/clang/test/Driver/range.c b/clang/test/Driver/range.c index da5748d7c723c..30140f3c208e0 100644 --- a/clang/test/Driver/range.c +++ b/clang/test/Driver/range.c @@ -177,14 +177,83 @@ // RUN: %clang -### -target x86_64 -ffast-math -fcomplex-arithmetic=basic -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=BASIC %s -// BASIC: -complex-range=basic -// FULL: -complex-range=full -// PRMTD: -complex-range=promoted -// BASIC-NOT: -complex-range=improved -// CHECK-NOT: -complex-range=basic -// IMPRVD: -complex-range=improved -// IMPRVD-NOT: -complex-range=basic -// CHECK-NOT: -complex-range=improved +// RUN: %clang -### --target=x86_64 -fcx-limited-range -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN21 %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-cx-limited-range -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### --target=x86_64 -fcx-fortran-rules -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN22 %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-cx-fortran-rules -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffast-math -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=basic -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN23 %s + +// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=promoted -fno-fast-math \ 
+// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN24 %s + +// RUN: %clang -### --target=x86_64 -fcomplex-arithmetic=improved -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE,WARN25 %s + +// RUN: %clang -### -Werror --target=x86_64 -fcomplex-arithmetic=full -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffp-model=aggressive -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffp-model=fast -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffp-model=precise -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -ffp-model=strict -fno-fast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=RANGE %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcx-limited-range \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fno-cx-limited-range \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcx-fortran-rules \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=IMPRVD %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fno-cx-fortran-rules \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffast-math \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=basic \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=promoted \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=PRMTD %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math 
-fcomplex-arithmetic=improved \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=IMPRVD %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -fcomplex-arithmetic=full \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=aggressive \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=BASIC %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=fast \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=PRMTD %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=precise \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s + +// RUN: %clang -### -Werror --target=x86_64 -fno-fast-math -ffp-model=strict \ +// RUN: -c %s 2>&1 | FileCheck --check-prefixes=FULL %s // WARN1: warning: overriding '-fcx-limited-range' option with '-fcx-fortran-rules' [-Woverriding-option] // WARN2: warning: overriding '-fno-cx-limited-range' option with '-fcx-fortran-rules' [-Woverriding-option] @@ -196,5 +265,20 @@ // WARN14: overriding '-complex-range=promoted' option with '-fcx-limited-range' [-Woverriding-option] // WARN17: warning: overriding '-fcomplex-arithmetic=full' option with '-fcomplex-arithmetic=basic' [-Woverriding-option] // WARN20: warning: overriding '-fcx-fortran-rules' option with '-fcx-limited-range' [-Woverriding-option] +// WARN21: warning: overriding '-fcx-limited-range' option with '-fno-fast-math' [-Woverriding-option] +// WARN22: warning: overriding '-fcx-fortran-rules' option with '-fno-fast-math' [-Woverriding-option] +// WARN23: warning: overriding '-fcomplex-arithmetic=basic' option with '-fno-fast-math' [-Woverriding-option] +// WARN24: warning: overriding '-fcomplex-arithmetic=promoted' option with '-fno-fast-math' [-Woverriding-option] +// WARN25: warning: overriding '-fcomplex-arithmetic=improved' option with '-fno-fast-math' [-Woverriding-option] + +// BASIC: -complex-range=basic +// FULL: -complex-range=full +// PRMTD: 
-complex-range=promoted +// BASIC-NOT: -complex-range=improved +// CHECK-NOT: -complex-range=basic +// IMPRVD: -complex-range=improved +// IMPRVD-NOT: -complex-range=basic +// CHECK-NOT: -complex-range=improved +// RANGE-NOT: -complex-range= // ERR: error: unsupported argument 'foo' to option '-fcomplex-arithmetic=' From 52360d195b85608c677d781272534dfa61e9a1c3 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Thu, 12 Jun 2025 09:27:27 +0800 Subject: [PATCH 169/851] [NFC] Use `llvm::includes` instead of `std::includes` (#143542) This PR follows up #143297. --- clang-tools-extra/clangd/refactor/Rename.cpp | 2 +- llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp | 6 ++---- llvm/tools/sancov/sancov.cpp | 3 +-- llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp | 4 ++-- llvm/unittests/ADT/DeltaAlgorithmTest.cpp | 4 ++-- llvm/utils/TableGen/AsmMatcherEmitter.cpp | 3 +-- llvm/utils/TableGen/Common/CodeGenRegisters.cpp | 7 ++----- 7 files changed, 11 insertions(+), 18 deletions(-) diff --git a/clang-tools-extra/clangd/refactor/Rename.cpp b/clang-tools-extra/clangd/refactor/Rename.cpp index d9b73b83e902a..c56375b1a98d3 100644 --- a/clang-tools-extra/clangd/refactor/Rename.cpp +++ b/clang-tools-extra/clangd/refactor/Rename.cpp @@ -1308,7 +1308,7 @@ getMappedRanges(ArrayRef Indexed, ArrayRef Lexed) { return std::nullopt; } // Fast check for the special subset case. 
- if (std::includes(Indexed.begin(), Indexed.end(), Lexed.begin(), Lexed.end())) + if (llvm::includes(Indexed, Lexed)) return Lexed.vec(); std::vector Best; diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index d94a2fbb23d23..61fef1387d82a 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1975,12 +1975,10 @@ Value *DFSanFunction::combineShadows(Value *V1, Value *V2, auto V1Elems = ShadowElements.find(V1); auto V2Elems = ShadowElements.find(V2); if (V1Elems != ShadowElements.end() && V2Elems != ShadowElements.end()) { - if (std::includes(V1Elems->second.begin(), V1Elems->second.end(), - V2Elems->second.begin(), V2Elems->second.end())) { + if (llvm::includes(V1Elems->second, V2Elems->second)) { return collapseToPrimitiveShadow(V1, Pos); } - if (std::includes(V2Elems->second.begin(), V2Elems->second.end(), - V1Elems->second.begin(), V1Elems->second.end())) { + if (llvm::includes(V2Elems->second, V1Elems->second)) { return collapseToPrimitiveShadow(V2, Pos); } } else if (V1Elems != ShadowElements.end()) { diff --git a/llvm/tools/sancov/sancov.cpp b/llvm/tools/sancov/sancov.cpp index 2cc84b47de6b9..aebb5effd0be7 100644 --- a/llvm/tools/sancov/sancov.cpp +++ b/llvm/tools/sancov/sancov.cpp @@ -889,8 +889,7 @@ symbolize(const RawCoverage &Data, const std::string ObjectFile) { } std::set AllAddrs = findCoveragePointAddrs(ObjectFile); - if (!std::includes(AllAddrs.begin(), AllAddrs.end(), Data.Addrs->begin(), - Data.Addrs->end())) { + if (!llvm::includes(AllAddrs, *Data.Addrs)) { fail("Coverage points in binary and .sancov file do not match."); } Coverage->Points = getCoveragePoints(ObjectFile, AllAddrs, *Data.Addrs); diff --git a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp index 66a67d96d1532..f543947899393 100644 --- 
a/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp +++ b/llvm/unittests/ADT/DAGDeltaAlgorithmTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/DAGDeltaAlgorithm.h" +#include "llvm/ADT/STLExtras.h" #include "gtest/gtest.h" #include #include @@ -23,8 +24,7 @@ class FixedDAGDeltaAlgorithm : public DAGDeltaAlgorithm { protected: bool ExecuteOneTest(const changeset_ty &Changes) override { ++NumTests; - return std::includes(Changes.begin(), Changes.end(), - FailingSet.begin(), FailingSet.end()); + return llvm::includes(Changes, FailingSet); } public: diff --git a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp index 5e284129180a0..24e18f42eb33c 100644 --- a/llvm/unittests/ADT/DeltaAlgorithmTest.cpp +++ b/llvm/unittests/ADT/DeltaAlgorithmTest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/ADT/DeltaAlgorithm.h" +#include "llvm/ADT/STLExtras.h" #include "gtest/gtest.h" #include #include @@ -38,8 +39,7 @@ class FixedDeltaAlgorithm final : public DeltaAlgorithm { protected: bool ExecuteOneTest(const changeset_ty &Changes) override { ++NumTests; - return std::includes(Changes.begin(), Changes.end(), - FailingSet.begin(), FailingSet.end()); + return llvm::includes(Changes, FailingSet); } public: diff --git a/llvm/utils/TableGen/AsmMatcherEmitter.cpp b/llvm/utils/TableGen/AsmMatcherEmitter.cpp index 9792eb41ea5d7..32098e96ce721 100644 --- a/llvm/utils/TableGen/AsmMatcherEmitter.cpp +++ b/llvm/utils/TableGen/AsmMatcherEmitter.cpp @@ -1330,8 +1330,7 @@ void AsmMatcherInfo::buildRegisterClasses( for (const RegisterSet &RS : RegisterSets) { ClassInfo *CI = RegisterSetClasses[RS]; for (const RegisterSet &RS2 : RegisterSets) - if (RS != RS2 && std::includes(RS2.begin(), RS2.end(), RS.begin(), - RS.end(), LessRecordByID())) + if (RS != RS2 && llvm::includes(RS2, RS, LessRecordByID())) 
CI->SuperClasses.push_back(RegisterSetClasses[RS2]); } diff --git a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp index 4d24eb3de1ed9..f52c21e97f9c8 100644 --- a/llvm/utils/TableGen/Common/CodeGenRegisters.cpp +++ b/llvm/utils/TableGen/Common/CodeGenRegisters.cpp @@ -933,9 +933,7 @@ bool CodeGenRegisterClass::Key::operator<( static bool testSubClass(const CodeGenRegisterClass *A, const CodeGenRegisterClass *B) { return A->RSI.isSubClassOf(B->RSI) && - std::includes(A->getMembers().begin(), A->getMembers().end(), - B->getMembers().begin(), B->getMembers().end(), - deref>()); + llvm::includes(A->getMembers(), B->getMembers(), deref>()); } /// Sorting predicate for register classes. This provides a topological @@ -1990,8 +1988,7 @@ findRegUnitSet(const std::vector &UniqueSets, // Return true if the RUSubSet is a subset of RUSuperSet. static bool isRegUnitSubSet(const std::vector &RUSubSet, const std::vector &RUSuperSet) { - return std::includes(RUSuperSet.begin(), RUSuperSet.end(), RUSubSet.begin(), - RUSubSet.end()); + return llvm::includes(RUSuperSet, RUSubSet); } /// Iteratively prune unit sets. Prune subsets that are close to the superset, From 082251bba4effea7f60191c6cbddacb3705c07db Mon Sep 17 00:00:00 2001 From: Jameson Nash Date: Wed, 11 Jun 2025 21:49:01 -0400 Subject: [PATCH 170/851] [AArch64] fix trampoline implementation: use X15 (#126743) AAPCS64 reserves any of X9-X15 for a compiler to choose to use for this purpose, and says not to use X16 or X18 like GCC (and the previous implementation) chose to use. The X18 register may need to get used by the kernel in some circumstances, as specified by the platform ABI, so it is generally an unwise choice. 
Simply choosing a different register fixes the problem of this being broken on any platform that actually follows the platform ABI (which is all of them except EABI, if I am reading this Linux kernel bug correctly: https://lkml2.uits.iu.edu/hypermail/linux/kernel/2001.2/01502.html). As a side benefit, this also generates slightly better code and avoids needing compiler-rt to be present. I did that by following the XCore implementation instead of the PPC one (although in hindsight, following the RISCV implementation might have been slightly more readable). That X18 is wrong to use for this purpose has been known for many years (e.g. https://www.mail-archive.com/gcc@gcc.gnu.org/msg76934.html), and it is also known that fixing this to use one of the correct registers is not an ABI break, since this only appears inside of a translation unit. Some of the other temporary registers (e.g. X9) are already reserved inside LLVM for internal use as a generic temporary register in the prologue before saving registers, while X15 was already used in rare cases as a scratch register in the prologue as well, so X15 seemed the most logical choice here. 
--- compiler-rt/lib/builtins/README.txt | 5 - compiler-rt/lib/builtins/trampoline_setup.c | 42 --- .../builtins/Unit/trampoline_setup_test.c | 2 +- .../lib/Optimizer/CodeGen/BoxedProcedure.cpp | 8 +- flang/test/Fir/boxproc.fir | 4 +- .../AArch64/AArch64CallingConvention.td | 25 +- .../Target/AArch64/AArch64FrameLowering.cpp | 85 ++++-- .../Target/AArch64/AArch64ISelLowering.cpp | 97 ++++--- llvm/lib/TargetParser/Triple.cpp | 2 - llvm/test/CodeGen/AArch64/nest-register.ll | 16 +- .../AArch64/statepoint-call-lowering.ll | 2 +- llvm/test/CodeGen/AArch64/trampoline.ll | 257 +++++++++++++++++- llvm/test/CodeGen/AArch64/win64cc-x18.ll | 27 +- .../CodeGen/AArch64/zero-call-used-regs.ll | 16 +- 14 files changed, 421 insertions(+), 167 deletions(-) diff --git a/compiler-rt/lib/builtins/README.txt b/compiler-rt/lib/builtins/README.txt index 19f26c92a0f94..2d213d95f333a 100644 --- a/compiler-rt/lib/builtins/README.txt +++ b/compiler-rt/lib/builtins/README.txt @@ -272,11 +272,6 @@ switch32 switch8 switchu8 -// This function generates a custom trampoline function with the specific -// realFunc and localsPtr values. -void __trampoline_setup(uint32_t* trampOnStack, int trampSizeAllocated, - const void* realFunc, void* localsPtr); - // There is no C interface to the *_vfp_d8_d15_regs functions. There are // called in the prolog and epilog of Thumb1 functions. 
When the C++ ABI use // SJLJ for exceptions, each function with a catch clause or destructors needs diff --git a/compiler-rt/lib/builtins/trampoline_setup.c b/compiler-rt/lib/builtins/trampoline_setup.c index 830e25e4c0303..844eb27944142 100644 --- a/compiler-rt/lib/builtins/trampoline_setup.c +++ b/compiler-rt/lib/builtins/trampoline_setup.c @@ -41,45 +41,3 @@ COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack, __clear_cache(trampOnStack, &trampOnStack[10]); } #endif // __powerpc__ && !defined(__powerpc64__) - -// The AArch64 compiler generates calls to __trampoline_setup() when creating -// trampoline functions on the stack for use with nested functions. -// This function creates a custom 36-byte trampoline function on the stack -// which loads x18 with a pointer to the outer function's locals -// and then jumps to the target nested function. -// Note: x18 is a reserved platform register on Windows and macOS. - -#if defined(__aarch64__) && defined(__ELF__) -COMPILER_RT_ABI void __trampoline_setup(uint32_t *trampOnStack, - int trampSizeAllocated, - const void *realFunc, void *localsPtr) { - // This should never happen, but if compiler did not allocate - // enough space on stack for the trampoline, abort. - if (trampSizeAllocated < 36) - compilerrt_abort(); - - // create trampoline - // Load realFunc into x17. mov/movk 16 bits at a time. 
- trampOnStack[0] = - 0xd2800000u | ((((uint64_t)realFunc >> 0) & 0xffffu) << 5) | 0x11; - trampOnStack[1] = - 0xf2a00000u | ((((uint64_t)realFunc >> 16) & 0xffffu) << 5) | 0x11; - trampOnStack[2] = - 0xf2c00000u | ((((uint64_t)realFunc >> 32) & 0xffffu) << 5) | 0x11; - trampOnStack[3] = - 0xf2e00000u | ((((uint64_t)realFunc >> 48) & 0xffffu) << 5) | 0x11; - // Load localsPtr into x18 - trampOnStack[4] = - 0xd2800000u | ((((uint64_t)localsPtr >> 0) & 0xffffu) << 5) | 0x12; - trampOnStack[5] = - 0xf2a00000u | ((((uint64_t)localsPtr >> 16) & 0xffffu) << 5) | 0x12; - trampOnStack[6] = - 0xf2c00000u | ((((uint64_t)localsPtr >> 32) & 0xffffu) << 5) | 0x12; - trampOnStack[7] = - 0xf2e00000u | ((((uint64_t)localsPtr >> 48) & 0xffffu) << 5) | 0x12; - trampOnStack[8] = 0xd61f0220; // br x17 - - // Clear instruction cache. - __clear_cache(trampOnStack, &trampOnStack[9]); -} -#endif // defined(__aarch64__) && !defined(__APPLE__) && !defined(_WIN64) diff --git a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c index d51d35acaa02f..da115fe764271 100644 --- a/compiler-rt/test/builtins/Unit/trampoline_setup_test.c +++ b/compiler-rt/test/builtins/Unit/trampoline_setup_test.c @@ -7,7 +7,7 @@ /* * Tests nested functions - * The ppc and aarch64 compilers generates a call to __trampoline_setup + * The ppc compiler generates a call to __trampoline_setup * The i386 and x86_64 compilers generate a call to ___enable_execute_stack */ diff --git a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp index 82b11ad7db32a..69bdb48146a54 100644 --- a/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp +++ b/flang/lib/Optimizer/CodeGen/BoxedProcedure.cpp @@ -274,12 +274,12 @@ class BoxedProcedurePass auto loc = embox.getLoc(); mlir::Type i8Ty = builder.getI8Type(); mlir::Type i8Ptr = builder.getRefType(i8Ty); - // For AArch64, PPC32 and PPC64, the thunk is populated by a call to + // For PPC32 
and PPC64, the thunk is populated by a call to // __trampoline_setup, which is defined in // compiler-rt/lib/builtins/trampoline_setup.c and requires the - // thunk size greater than 32 bytes. For RISCV and x86_64, the - // thunk setup doesn't go through __trampoline_setup and fits in 32 - // bytes. + // thunk size greater than 32 bytes. For AArch64, RISCV and x86_64, + // the thunk setup doesn't go through __trampoline_setup and fits in + // 32 bytes. fir::SequenceType::Extent thunkSize = triple.getTrampolineSize(); mlir::Type buffTy = SequenceType::get({thunkSize}, i8Ty); auto buffer = builder.create(loc, buffTy); diff --git a/flang/test/Fir/boxproc.fir b/flang/test/Fir/boxproc.fir index 5d82522055adc..97d9b38ed6f40 100644 --- a/flang/test/Fir/boxproc.fir +++ b/flang/test/Fir/boxproc.fir @@ -3,7 +3,7 @@ // RUN: %if powerpc-registered-target %{tco --target=powerpc64le-unknown-linux-gnu %s | FileCheck %s --check-prefixes=CHECK,CHECK-PPC %} // CHECK-LABEL: define void @_QPtest_proc_dummy() -// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [36 x i8], i64 1, align 1 +// CHECK-AARCH64: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1 // CHECK-X86: %[[VAL_3:.*]] = alloca [32 x i8], i64 1, align 1 // CHECK-PPC: %[[VAL_3:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1 // CHECK: %[[VAL_1:.*]] = alloca { ptr }, i64 1, align 8 @@ -63,7 +63,7 @@ func.func @_QPtest_proc_dummy_other(%arg0: !fir.boxproc<() -> ()>) { } // CHECK-LABEL: define void @_QPtest_proc_dummy_char() -// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [36 x i8], i64 1, align 1 +// CHECK-AARCH64: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1 // CHECK-X86: %[[VAL_20:.*]] = alloca [32 x i8], i64 1, align 1 // CHECK-PPC: %[[VAL_20:.*]] = alloca [4{{[0-8]+}} x i8], i64 1, align 1 // CHECK: %[[VAL_2:.*]] = alloca { { ptr, i64 } }, i64 1, align 8 diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td index 920cc67273146..1b5a713bffdc9 100644 --- 
a/llvm/lib/Target/AArch64/AArch64CallingConvention.td +++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td @@ -28,6 +28,12 @@ class CCIfSubtarget //===----------------------------------------------------------------------===// defvar AArch64_Common = [ + // The 'nest' parameter, if any, is passed in X15. + // The previous register used here (X18) is also defined to be unavailable + // for this purpose, while all of X9-X15 were defined to be free for LLVM to + // use for this, so use X15 (which LLVM often already clobbers anyways). + CCIfNest>, + CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32], CCBitConvertToType>, @@ -117,13 +123,7 @@ defvar AArch64_Common = [ ]; let Entry = 1 in -def CC_AArch64_AAPCS : CallingConv>], - AArch64_Common -)>; +def CC_AArch64_AAPCS : CallingConv; let Entry = 1 in def RetCC_AArch64_AAPCS : CallingConv<[ @@ -177,6 +177,8 @@ def CC_AArch64_Win64_VarArg : CallingConv<[ // a stack layout compatible with the x64 calling convention. let Entry = 1 in def CC_AArch64_Arm64EC_VarArg : CallingConv<[ + CCIfNest>, + // Convert small floating-point values to integer. CCIfType<[f16, bf16], CCBitConvertToType>, CCIfType<[f32], CCBitConvertToType>, @@ -353,6 +355,8 @@ def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[ // + Stack slots are sized as needed rather than being at least 64-bit. let Entry = 1 in def CC_AArch64_DarwinPCS : CallingConv<[ + CCIfNest>, + CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, @@ -427,6 +431,8 @@ def CC_AArch64_DarwinPCS : CallingConv<[ let Entry = 1 in def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ + CCIfNest>, + CCIfType<[iPTR], CCBitConvertToType>, CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, @@ -450,6 +456,8 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[ // same as the normal Darwin VarArgs handling. 
let Entry = 1 in def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ + CCIfNest>, + CCIfType<[v2f32], CCBitConvertToType>, CCIfType<[v2f64, v4f32, f128], CCBitConvertToType>, @@ -494,6 +502,8 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[ let Entry = 1 in def CC_AArch64_GHC : CallingConv<[ + CCIfNest>, + CCIfType<[iPTR], CCBitConvertToType>, // Handle all vector types as either f64 or v2f64. @@ -522,6 +532,7 @@ def CC_AArch64_Preserve_None : CallingConv<[ // We can pass arguments in all general registers, except: // - X8, used for sret + // - X15 (on Windows), used as a temporary register in the prologue when allocating call frames // - X16/X17, used by the linker as IP0/IP1 // - X18, the platform register // - X19, the base pointer diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 3335ee04bb0e0..2650c621e19f6 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -331,7 +331,9 @@ static int64_t getArgumentStackToRestore(MachineFunction &MF, static bool produceCompactUnwindFrame(MachineFunction &MF); static bool needsWinCFI(const MachineFunction &MF); static StackOffset getSVEStackSize(const MachineFunction &MF); -static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB); +static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, + bool HasCall = false); +static bool requiresSaveVG(const MachineFunction &MF); /// Returns true if a homogeneous prolog or epilog code can be emitted /// for the size optimization. If possible, a frame helper call is injected. 
@@ -1006,6 +1008,16 @@ void AArch64FrameLowering::emitZeroCallUsedRegs(BitVector RegsToZero, } } +static bool windowsRequiresStackProbe(const MachineFunction &MF, + uint64_t StackSizeInBytes) { + const AArch64Subtarget &Subtarget = MF.getSubtarget(); + const AArch64FunctionInfo &MFI = *MF.getInfo(); + // TODO: When implementing stack protectors, take that into account + // for the probe threshold. + return Subtarget.isTargetWindows() && MFI.hasStackProbing() && + StackSizeInBytes >= uint64_t(MFI.getStackProbeSize()); +} + static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs, const MachineBasicBlock &MBB) { const MachineFunction *MF = MBB.getParent(); @@ -1027,7 +1039,8 @@ static void getLiveRegsForEntryMBB(LivePhysRegs &LiveRegs, // but we would then have to make sure that we were in fact saving at least one // callee-save register in the prologue, which is additional complexity that // doesn't seem worth the benefit. -static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { +static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB, + bool HasCall) { MachineFunction *MF = MBB->getParent(); // If MBB is an entry block, use X9 as the scratch register @@ -1041,6 +1054,11 @@ static Register findScratchNonCalleeSaveRegister(MachineBasicBlock *MBB) { const AArch64RegisterInfo &TRI = *Subtarget.getRegisterInfo(); LivePhysRegs LiveRegs(TRI); getLiveRegsForEntryMBB(LiveRegs, *MBB); + if (HasCall) { + LiveRegs.addReg(AArch64::X16); + LiveRegs.addReg(AArch64::X17); + LiveRegs.addReg(AArch64::X18); + } // Prefer X9 since it was historically used for the prologue scratch reg. const MachineRegisterInfo &MRI = MF->getRegInfo(); @@ -1081,23 +1099,18 @@ bool AArch64FrameLowering::canUseAsPrologue( MBB.isLiveIn(AArch64::NZCV)) return false; - // Don't need a scratch register if we're not going to re-align the stack or - // emit stack probes. 
- if (!RegInfo->hasStackRealignment(*MF) && !TLI->hasInlineStackProbe(*MF)) - return true; - // Otherwise, we can use any block as long as it has a scratch register - // available. - return findScratchNonCalleeSaveRegister(TmpMBB) != AArch64::NoRegister; -} + if (RegInfo->hasStackRealignment(*MF) || TLI->hasInlineStackProbe(*MF)) + if (findScratchNonCalleeSaveRegister(TmpMBB) == AArch64::NoRegister) + return false; -static bool windowsRequiresStackProbe(MachineFunction &MF, - uint64_t StackSizeInBytes) { - const AArch64Subtarget &Subtarget = MF.getSubtarget(); - const AArch64FunctionInfo &MFI = *MF.getInfo(); - // TODO: When implementing stack protectors, take that into account - // for the probe threshold. - return Subtarget.isTargetWindows() && MFI.hasStackProbing() && - StackSizeInBytes >= uint64_t(MFI.getStackProbeSize()); + // May need a scratch register (for return value) if require making a special + // call + if (requiresSaveVG(*MF) || + windowsRequiresStackProbe(*MF, std::numeric_limits::max())) + if (findScratchNonCalleeSaveRegister(TmpMBB, true) == AArch64::NoRegister) + return false; + + return true; } static bool needsWinCFI(const MachineFunction &MF) { @@ -1378,8 +1391,8 @@ bool requiresGetVGCall(MachineFunction &MF) { !MF.getSubtarget().hasSVE(); } -static bool requiresSaveVG(MachineFunction &MF) { - AArch64FunctionInfo *AFI = MF.getInfo(); +static bool requiresSaveVG(const MachineFunction &MF) { + const AArch64FunctionInfo *AFI = MF.getInfo(); // For Darwin platforms we don't save VG for non-SVE functions, even if SME // is enabled with streaming mode changes. if (!AFI->hasStreamingModeChanges()) @@ -2049,6 +2062,29 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, if (AFI->getSVECalleeSavedStackSize()) report_fatal_error( "SVE callee saves not yet supported with stack probing"); + + // Find an available register to spill the value of X15 to, if X15 is being + // used already for nest. 
+ unsigned X15Scratch = AArch64::NoRegister; + const AArch64Subtarget &STI = MF.getSubtarget(); + if (llvm::any_of(MBB.liveins(), + [&STI](const MachineBasicBlock::RegisterMaskPair &LiveIn) { + return STI.getRegisterInfo()->isSuperOrSubRegisterEq( + AArch64::X15, LiveIn.PhysReg); + })) { + X15Scratch = findScratchNonCalleeSaveRegister(&MBB, true); + assert(X15Scratch != AArch64::NoRegister && + (X15Scratch < AArch64::X15 || X15Scratch > AArch64::X17)); +#ifndef NDEBUG + LiveRegs.removeReg(AArch64::X15); // ignore X15 since we restore it +#endif + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), X15Scratch) + .addReg(AArch64::XZR) + .addReg(AArch64::X15, RegState::Undef) + .addReg(AArch64::X15, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); + } + uint64_t NumWords = (NumBytes + RealignmentPadding) >> 4; if (NeedsWinCFI) { HasWinCFI = true; @@ -2171,6 +2207,13 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF, // we've set a frame pointer and already finished the SEH prologue. assert(!NeedsWinCFI); } + if (X15Scratch != AArch64::NoRegister) { + BuildMI(MBB, MBBI, DL, TII->get(AArch64::ORRXrr), AArch64::X15) + .addReg(AArch64::XZR) + .addReg(X15Scratch, RegState::Undef) + .addReg(X15Scratch, RegState::Implicit) + .setMIFlag(MachineInstr::FrameSetup); + } } StackOffset SVECalleeSavesSize = {}, SVELocalsSize = SVEStackSize; @@ -3355,7 +3398,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters( unsigned X0Scratch = AArch64::NoRegister; if (Reg1 == AArch64::VG) { // Find an available register to store value of VG to. 
- Reg1 = findScratchNonCalleeSaveRegister(&MBB); + Reg1 = findScratchNonCalleeSaveRegister(&MBB, true); assert(Reg1 != AArch64::NoRegister); SMEAttrs Attrs = AFI->getSMEFnAttrs(); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 766599d567efd..ad5b90984188e 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7126,59 +7126,80 @@ static SDValue LowerFLDEXP(SDValue Op, SelectionDAG &DAG) { SDValue AArch64TargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { - // Note: x18 cannot be used for the Nest parameter on Windows and macOS. - if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) - report_fatal_error( - "ADJUST_TRAMPOLINE operation is only supported on Linux."); - return Op.getOperand(0); } SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const { - - // Note: x18 cannot be used for the Nest parameter on Windows and macOS. 
- if (Subtarget->isTargetDarwin() || Subtarget->isTargetWindows()) - report_fatal_error("INIT_TRAMPOLINE operation is only supported on Linux."); - SDValue Chain = Op.getOperand(0); - SDValue Trmp = Op.getOperand(1); // trampoline + SDValue Trmp = Op.getOperand(1); // trampoline, >=32 bytes SDValue FPtr = Op.getOperand(2); // nested function SDValue Nest = Op.getOperand(3); // 'nest' parameter value - SDLoc dl(Op); - EVT PtrVT = getPointerTy(DAG.getDataLayout()); - Type *IntPtrTy = DAG.getDataLayout().getIntPtrType(*DAG.getContext()); + const Value *TrmpAddr = cast(Op.getOperand(4))->getValue(); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; + // ldr NestReg, .+16 + // ldr x17, .+20 + // br x17 + // .word 0 + // .nest: .qword nest + // .fptr: .qword fptr + SDValue OutChains[5]; - Entry.Ty = IntPtrTy; - Entry.Node = Trmp; - Args.push_back(Entry); + const Function *Func = + cast(cast(Op.getOperand(5))->getValue()); + CallingConv::ID CC = Func->getCallingConv(); + unsigned NestReg; - if (auto *FI = dyn_cast(Trmp.getNode())) { - MachineFunction &MF = DAG.getMachineFunction(); - MachineFrameInfo &MFI = MF.getFrameInfo(); - Entry.Node = - DAG.getConstant(MFI.getObjectSize(FI->getIndex()), dl, MVT::i64); - } else - Entry.Node = DAG.getConstant(36, dl, MVT::i64); + switch (CC) { + default: + NestReg = 0x0f; // X15 + case CallingConv::ARM64EC_Thunk_Native: + case CallingConv::ARM64EC_Thunk_X64: + // Must be kept in sync with AArch64CallingConv.td + NestReg = 0x04; // X4 + break; + } - Args.push_back(Entry); - Entry.Node = FPtr; - Args.push_back(Entry); - Entry.Node = Nest; - Args.push_back(Entry); + const char FptrReg = 0x11; // X17 - // Lower to a call to __trampoline_setup(Trmp, TrampSize, FPtr, ctx_reg) - TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(Chain).setLibCallee( - CallingConv::C, Type::getVoidTy(*DAG.getContext()), - DAG.getExternalSymbol("__trampoline_setup", PtrVT), std::move(Args)); + SDValue Addr = Trmp; - 
std::pair CallResult = LowerCallTo(CLI); - return CallResult.second; + SDLoc dl(Op); + OutChains[0] = DAG.getStore( + Chain, dl, DAG.getConstant(0x58000080u | NestReg, dl, MVT::i32), Addr, + MachinePointerInfo(TrmpAddr)); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(4, dl, MVT::i64)); + OutChains[1] = DAG.getStore( + Chain, dl, DAG.getConstant(0x580000b0u | FptrReg, dl, MVT::i32), Addr, + MachinePointerInfo(TrmpAddr, 4)); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(8, dl, MVT::i64)); + OutChains[2] = + DAG.getStore(Chain, dl, DAG.getConstant(0xd61f0220u, dl, MVT::i32), Addr, + MachinePointerInfo(TrmpAddr, 8)); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(16, dl, MVT::i64)); + OutChains[3] = + DAG.getStore(Chain, dl, Nest, Addr, MachinePointerInfo(TrmpAddr, 16)); + + Addr = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(24, dl, MVT::i64)); + OutChains[4] = + DAG.getStore(Chain, dl, FPtr, Addr, MachinePointerInfo(TrmpAddr, 24)); + + SDValue StoreToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains); + + SDValue EndOfTrmp = DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp, + DAG.getConstant(12, dl, MVT::i64)); + + // Call clear cache on the trampoline instructions. 
+ return DAG.getNode(ISD::CLEAR_CACHE, dl, MVT::Other, StoreToken, Trmp, + EndOfTrmp); } SDValue AArch64TargetLowering::LowerOperation(SDValue Op, diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index bd291e1918219..5718ae385bac1 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -1754,8 +1754,6 @@ unsigned Triple::getTrampolineSize() const { if (isOSLinux()) return 48; break; - case Triple::aarch64: - return 36; } return 32; } diff --git a/llvm/test/CodeGen/AArch64/nest-register.ll b/llvm/test/CodeGen/AArch64/nest-register.ll index 1e1c1b044bab6..2e94dfba1fa52 100644 --- a/llvm/test/CodeGen/AArch64/nest-register.ll +++ b/llvm/test/CodeGen/AArch64/nest-register.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -disable-post-ra -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; Tests that the 'nest' parameter attribute causes the relevant parameter to be @@ -5,18 +6,21 @@ define ptr @nest_receiver(ptr nest %arg) nounwind { ; CHECK-LABEL: nest_receiver: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: mov x0, x18 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: mov x0, x15 +; CHECK-NEXT: ret ret ptr %arg } define ptr @nest_caller(ptr %arg) nounwind { ; CHECK-LABEL: nest_caller: -; CHECK: mov x18, x0 -; CHECK-NEXT: bl nest_receiver -; CHECK: ret +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: mov x15, x0 +; CHECK-NEXT: bl nest_receiver +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret %result = call ptr @nest_receiver(ptr nest %arg) ret ptr %result diff --git a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll index 9619895c450ca..32c3eaeb9c876 100644 --- a/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll +++ b/llvm/test/CodeGen/AArch64/statepoint-call-lowering.ll @@ -207,7 +207,7 @@ define void @test_attributes(ptr byval(%struct2) %s) gc "statepoint-example" { ; CHECK-NEXT: .cfi_offset w30, -16 ; CHECK-NEXT: ldr x8, [sp, #64] ; CHECK-NEXT: ldr q0, [sp, #48] -; CHECK-NEXT: mov x18, xzr +; CHECK-NEXT: mov x15, xzr ; CHECK-NEXT: mov w0, #42 // =0x2a ; CHECK-NEXT: mov w1, #17 // =0x11 ; CHECK-NEXT: str x8, [sp, #16] diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll index 30ac2aa283b3e..d9016b02a0f80 100644 --- a/llvm/test/CodeGen/AArch64/trampoline.ll +++ b/llvm/test/CodeGen/AArch64/trampoline.ll @@ -1,32 +1,265 @@ -; RUN: llc -mtriple=aarch64-- < %s | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK-LINUX +; RUN: llc -mtriple=aarch64-none-eabi < %s | FileCheck %s --check-prefixes=CHECK-LINUX +; RUN: llc -mtriple=aarch64-pc-windows-msvc < %s | FileCheck %s --check-prefix=CHECK-PC +; RUN: llc -mtriple=aarch64-apple-darwin < %s | FileCheck %s --check-prefixes=CHECK-APPLE @trampg = internal global [36 x i8] zeroinitializer, align 8 declare void @llvm.init.trampoline(ptr, ptr, ptr); declare ptr @llvm.adjust.trampoline(ptr); -define i64 @f(ptr nest %c, i64 %x, i64 %y) { - %sum = add i64 %x, %y - ret i64 %sum +define ptr @f(ptr nest %x, i64 %y) { +; CHECK-LINUX-LABEL: f: +; CHECK-LINUX: // %bb.0: +; CHECK-LINUX-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-LINUX-NEXT: sub sp, sp, #237, lsl #12 // =970752 +; CHECK-LINUX-NEXT: sub sp, sp, #3264 +; CHECK-LINUX-NEXT: .cfi_def_cfa_offset 974032 +; CHECK-LINUX-NEXT: .cfi_offset w29, -16 +; CHECK-LINUX-NEXT: add x0, x15, x0 +; CHECK-LINUX-NEXT: add sp, sp, #237, lsl #12 // =970752 +; CHECK-LINUX-NEXT: add sp, sp, #3264 +; CHECK-LINUX-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-LINUX-NEXT: ret +; +; CHECK-PC-LABEL: f: +; CHECK-PC: .seh_proc f +; CHECK-PC-NEXT: // %bb.0: +; CHECK-PC-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-PC-NEXT: .seh_save_fplr_x 16 +; CHECK-PC-NEXT: mov x9, x15 +; CHECK-PC-NEXT: mov x15, #60876 // =0xedcc +; CHECK-PC-NEXT: .seh_nop +; CHECK-PC-NEXT: bl __chkstk +; CHECK-PC-NEXT: .seh_nop +; CHECK-PC-NEXT: sub sp, sp, x15, lsl #4 +; CHECK-PC-NEXT: .seh_stackalloc 974016 +; CHECK-PC-NEXT: mov x15, x9 +; CHECK-PC-NEXT: .seh_endprologue +; CHECK-PC-NEXT: add x0, x15, x0 +; CHECK-PC-NEXT: .seh_startepilogue +; CHECK-PC-NEXT: add sp, sp, #237, lsl #12 // =970752 +; CHECK-PC-NEXT: .seh_stackalloc 970752 +; CHECK-PC-NEXT: add sp, sp, #3264 +; CHECK-PC-NEXT: .seh_stackalloc 3264 +; CHECK-PC-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-PC-NEXT: .seh_save_fplr_x 16 +; CHECK-PC-NEXT: .seh_endepilogue +; CHECK-PC-NEXT: ret +; CHECK-PC-NEXT: .seh_endfunclet +; CHECK-PC-NEXT: .seh_endproc +; +; CHECK-APPLE-LABEL: f: +; CHECK-APPLE: ; %bb.0: +; CHECK-APPLE-NEXT: stp x28, x27, [sp, #-16]! 
; 16-byte Folded Spill +; CHECK-APPLE-NEXT: sub sp, sp, #237, lsl #12 ; =970752 +; CHECK-APPLE-NEXT: sub sp, sp, #3264 +; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 974032 +; CHECK-APPLE-NEXT: .cfi_offset w27, -8 +; CHECK-APPLE-NEXT: .cfi_offset w28, -16 +; CHECK-APPLE-NEXT: add x0, x15, x0 +; CHECK-APPLE-NEXT: add sp, sp, #237, lsl #12 ; =970752 +; CHECK-APPLE-NEXT: add sp, sp, #3264 +; CHECK-APPLE-NEXT: ldp x28, x27, [sp], #16 ; 16-byte Folded Reload +; CHECK-APPLE-NEXT: ret + %chkstack = alloca [u0xedcba x i8] + %sum = getelementptr i8, ptr %x, i64 %y + ret ptr %sum } define i64 @func1() { +; CHECK-LINUX-LABEL: func1: +; CHECK-LINUX: // %bb.0: +; CHECK-LINUX-NEXT: sub sp, sp, #64 +; CHECK-LINUX-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-LINUX-NEXT: .cfi_def_cfa_offset 64 +; CHECK-LINUX-NEXT: .cfi_offset w30, -16 +; CHECK-LINUX-NEXT: adrp x8, :got:f +; CHECK-LINUX-NEXT: mov w9, #544 // =0x220 +; CHECK-LINUX-NEXT: add x0, sp, #8 +; CHECK-LINUX-NEXT: ldr x8, [x8, :got_lo12:f] +; CHECK-LINUX-NEXT: movk w9, #54815, lsl #16 +; CHECK-LINUX-NEXT: str w9, [sp, #16] +; CHECK-LINUX-NEXT: add x9, sp, #56 +; CHECK-LINUX-NEXT: stp x9, x8, [sp, #24] +; CHECK-LINUX-NEXT: mov x8, #132 // =0x84 +; CHECK-LINUX-NEXT: movk x8, #22528, lsl #16 +; CHECK-LINUX-NEXT: movk x8, #177, lsl #32 +; CHECK-LINUX-NEXT: movk x8, #22528, lsl #48 +; CHECK-LINUX-NEXT: str x8, [sp, #8] +; CHECK-LINUX-NEXT: add x8, sp, #8 +; CHECK-LINUX-NEXT: add x1, x8, #12 +; CHECK-LINUX-NEXT: bl __clear_cache +; CHECK-LINUX-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-LINUX-NEXT: mov x0, xzr +; CHECK-LINUX-NEXT: add sp, sp, #64 +; CHECK-LINUX-NEXT: ret +; +; CHECK-PC-LABEL: func1: +; CHECK-PC: .seh_proc func1 +; CHECK-PC-NEXT: // %bb.0: +; CHECK-PC-NEXT: sub sp, sp, #64 +; CHECK-PC-NEXT: .seh_stackalloc 64 +; CHECK-PC-NEXT: str x30, [sp, #48] // 8-byte Folded Spill +; CHECK-PC-NEXT: .seh_save_reg x30, 48 +; CHECK-PC-NEXT: .seh_endprologue +; CHECK-PC-NEXT: adrp x8, f +; CHECK-PC-NEXT: add x8, 
x8, :lo12:f +; CHECK-PC-NEXT: add x9, sp, #56 +; CHECK-PC-NEXT: stp x9, x8, [sp, #24] +; CHECK-PC-NEXT: mov w8, #544 // =0x220 +; CHECK-PC-NEXT: add x0, sp, #8 +; CHECK-PC-NEXT: movk w8, #54815, lsl #16 +; CHECK-PC-NEXT: str w8, [sp, #16] +; CHECK-PC-NEXT: mov x8, #132 // =0x84 +; CHECK-PC-NEXT: movk x8, #22528, lsl #16 +; CHECK-PC-NEXT: movk x8, #177, lsl #32 +; CHECK-PC-NEXT: movk x8, #22528, lsl #48 +; CHECK-PC-NEXT: str x8, [sp, #8] +; CHECK-PC-NEXT: add x8, sp, #8 +; CHECK-PC-NEXT: add x1, x8, #12 +; CHECK-PC-NEXT: bl __clear_cache +; CHECK-PC-NEXT: mov x0, xzr +; CHECK-PC-NEXT: .seh_startepilogue +; CHECK-PC-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload +; CHECK-PC-NEXT: .seh_save_reg x30, 48 +; CHECK-PC-NEXT: add sp, sp, #64 +; CHECK-PC-NEXT: .seh_stackalloc 64 +; CHECK-PC-NEXT: .seh_endepilogue +; CHECK-PC-NEXT: ret +; CHECK-PC-NEXT: .seh_endfunclet +; CHECK-PC-NEXT: .seh_endproc +; +; CHECK-APPLE-LABEL: func1: +; CHECK-APPLE: ; %bb.0: +; CHECK-APPLE-NEXT: sub sp, sp, #64 +; CHECK-APPLE-NEXT: stp x29, x30, [sp, #48] ; 16-byte Folded Spill +; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 64 +; CHECK-APPLE-NEXT: .cfi_offset w30, -8 +; CHECK-APPLE-NEXT: .cfi_offset w29, -16 +; CHECK-APPLE-NEXT: Lloh0: +; CHECK-APPLE-NEXT: adrp x8, _f@PAGE +; CHECK-APPLE-NEXT: Lloh1: +; CHECK-APPLE-NEXT: add x8, x8, _f@PAGEOFF +; CHECK-APPLE-NEXT: add x9, sp, #40 +; CHECK-APPLE-NEXT: stp x9, x8, [sp, #16] +; CHECK-APPLE-NEXT: mov w8, #544 ; =0x220 +; CHECK-APPLE-NEXT: mov x0, sp +; CHECK-APPLE-NEXT: movk w8, #54815, lsl #16 +; CHECK-APPLE-NEXT: str w8, [sp, #8] +; CHECK-APPLE-NEXT: mov x8, #132 ; =0x84 +; CHECK-APPLE-NEXT: movk x8, #22528, lsl #16 +; CHECK-APPLE-NEXT: movk x8, #177, lsl #32 +; CHECK-APPLE-NEXT: movk x8, #22528, lsl #48 +; CHECK-APPLE-NEXT: str x8, [sp] +; CHECK-APPLE-NEXT: mov x8, sp +; CHECK-APPLE-NEXT: add x1, x8, #12 +; CHECK-APPLE-NEXT: bl ___clear_cache +; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #48] ; 16-byte Folded Reload +; CHECK-APPLE-NEXT: mov x0, xzr +; 
CHECK-APPLE-NEXT: add sp, sp, #64 +; CHECK-APPLE-NEXT: ret +; CHECK-APPLE-NEXT: .loh AdrpAdd Lloh0, Lloh1 %val = alloca i64 - %nval = bitcast ptr %val to ptr %tramp = alloca [36 x i8], align 8 - ; CHECK: mov w1, #36 - ; CHECK: bl __trampoline_setup - call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %nval) + call void @llvm.init.trampoline(ptr %tramp, ptr @f, ptr %val) %fp = call ptr @llvm.adjust.trampoline(ptr %tramp) ret i64 0 } define i64 @func2() { +; CHECK-LINUX-LABEL: func2: +; CHECK-LINUX: // %bb.0: +; CHECK-LINUX-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-LINUX-NEXT: .cfi_def_cfa_offset 16 +; CHECK-LINUX-NEXT: .cfi_offset w30, -16 +; CHECK-LINUX-NEXT: adrp x8, :got:f +; CHECK-LINUX-NEXT: mov w9, #544 // =0x220 +; CHECK-LINUX-NEXT: adrp x0, trampg +; CHECK-LINUX-NEXT: add x0, x0, :lo12:trampg +; CHECK-LINUX-NEXT: ldr x8, [x8, :got_lo12:f] +; CHECK-LINUX-NEXT: movk w9, #54815, lsl #16 +; CHECK-LINUX-NEXT: str w9, [x0, #8] +; CHECK-LINUX-NEXT: add x9, sp, #8 +; CHECK-LINUX-NEXT: add x1, x0, #12 +; CHECK-LINUX-NEXT: stp x9, x8, [x0, #16] +; CHECK-LINUX-NEXT: mov x8, #132 // =0x84 +; CHECK-LINUX-NEXT: movk x8, #22528, lsl #16 +; CHECK-LINUX-NEXT: movk x8, #177, lsl #32 +; CHECK-LINUX-NEXT: movk x8, #22528, lsl #48 +; CHECK-LINUX-NEXT: str x8, [x0] +; CHECK-LINUX-NEXT: bl __clear_cache +; CHECK-LINUX-NEXT: mov x0, xzr +; CHECK-LINUX-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-LINUX-NEXT: ret +; +; CHECK-PC-LABEL: func2: +; CHECK-PC: .seh_proc func2 +; CHECK-PC-NEXT: // %bb.0: +; CHECK-PC-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-PC-NEXT: .seh_save_reg_x x30, 16 +; CHECK-PC-NEXT: .seh_endprologue +; CHECK-PC-NEXT: adrp x0, trampg +; CHECK-PC-NEXT: add x0, x0, :lo12:trampg +; CHECK-PC-NEXT: adrp x8, f +; CHECK-PC-NEXT: add x8, x8, :lo12:f +; CHECK-PC-NEXT: add x9, sp, #8 +; CHECK-PC-NEXT: add x1, x0, #12 +; CHECK-PC-NEXT: stp x9, x8, [x0, #16] +; CHECK-PC-NEXT: mov w8, #544 // =0x220 +; CHECK-PC-NEXT: movk w8, #54815, lsl #16 +; CHECK-PC-NEXT: str w8, [x0, #8] +; CHECK-PC-NEXT: mov x8, #132 // =0x84 +; CHECK-PC-NEXT: movk x8, #22528, lsl #16 +; CHECK-PC-NEXT: movk x8, #177, lsl #32 +; CHECK-PC-NEXT: movk x8, #22528, lsl #48 +; CHECK-PC-NEXT: str x8, [x0] +; CHECK-PC-NEXT: bl __clear_cache +; CHECK-PC-NEXT: mov x0, xzr +; CHECK-PC-NEXT: .seh_startepilogue +; CHECK-PC-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-PC-NEXT: .seh_save_reg_x x30, 16 +; CHECK-PC-NEXT: .seh_endepilogue +; CHECK-PC-NEXT: ret +; CHECK-PC-NEXT: .seh_endfunclet +; CHECK-PC-NEXT: .seh_endproc +; +; CHECK-APPLE-LABEL: func2: +; CHECK-APPLE: ; %bb.0: +; CHECK-APPLE-NEXT: sub sp, sp, #32 +; CHECK-APPLE-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; CHECK-APPLE-NEXT: .cfi_def_cfa_offset 32 +; CHECK-APPLE-NEXT: .cfi_offset w30, -8 +; CHECK-APPLE-NEXT: .cfi_offset w29, -16 +; CHECK-APPLE-NEXT: Lloh2: +; CHECK-APPLE-NEXT: adrp x0, _trampg@PAGE +; CHECK-APPLE-NEXT: Lloh3: +; CHECK-APPLE-NEXT: add x0, x0, _trampg@PAGEOFF +; CHECK-APPLE-NEXT: Lloh4: +; CHECK-APPLE-NEXT: adrp x8, _f@PAGE +; CHECK-APPLE-NEXT: Lloh5: +; CHECK-APPLE-NEXT: add x8, x8, _f@PAGEOFF +; CHECK-APPLE-NEXT: add x9, sp, #8 +; CHECK-APPLE-NEXT: add x1, x0, #12 +; CHECK-APPLE-NEXT: stp x9, x8, [x0, #16] +; CHECK-APPLE-NEXT: mov w8, #544 ; =0x220 +; CHECK-APPLE-NEXT: movk w8, #54815, lsl #16 +; CHECK-APPLE-NEXT: str w8, [x0, #8] +; CHECK-APPLE-NEXT: mov x8, #132 ; =0x84 +; CHECK-APPLE-NEXT: movk x8, #22528, lsl #16 +; CHECK-APPLE-NEXT: movk x8, #177, lsl #32 +; CHECK-APPLE-NEXT: movk x8, #22528, lsl #48 +; 
CHECK-APPLE-NEXT: str x8, [x0] +; CHECK-APPLE-NEXT: bl ___clear_cache +; CHECK-APPLE-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; CHECK-APPLE-NEXT: mov x0, xzr +; CHECK-APPLE-NEXT: add sp, sp, #32 +; CHECK-APPLE-NEXT: ret +; CHECK-APPLE-NEXT: .loh AdrpAdd Lloh4, Lloh5 +; CHECK-APPLE-NEXT: .loh AdrpAdd Lloh2, Lloh3 %val = alloca i64 - %nval = bitcast ptr %val to ptr - ; CHECK: mov w1, #36 - ; CHECK: bl __trampoline_setup - call void @llvm.init.trampoline(ptr @trampg, ptr @f, ptr %nval) + call void @llvm.init.trampoline(ptr @trampg, ptr @f, ptr %val) %fp = call ptr @llvm.adjust.trampoline(ptr @trampg) ret i64 0 } diff --git a/llvm/test/CodeGen/AArch64/win64cc-x18.ll b/llvm/test/CodeGen/AArch64/win64cc-x18.ll index b3e78cc9bbb81..4b45c300e9c1d 100644 --- a/llvm/test/CodeGen/AArch64/win64cc-x18.ll +++ b/llvm/test/CodeGen/AArch64/win64cc-x18.ll @@ -1,35 +1,26 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +;; Testing that nest uses x15 on all calling conventions (except Arm64EC) -;; Testing that x18 is not clobbered when passing pointers with the nest -;; attribute on windows - -; RUN: llc < %s -mtriple=aarch64-pc-windows-msvc | FileCheck %s --check-prefixes=CHECK,CHECK-NO-X18 -; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s --check-prefixes=CHECK,CHECK-X18 +; RUN: llc < %s -mtriple=aarch64-pc-windows-msvc | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-linux-gnu | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-apple-darwin- | FileCheck %s define dso_local i64 @other(ptr nest %p) #0 { ; CHECK-LABEL: other: -; CHECK-X18: ldr x0, [x18] -; CHECK-NO-X18: ldr x0, [x0] +; CHECK: ldr x0, [x15] +; CHECK: ret %r = load i64, ptr %p -; CHECK: ret ret i64 %r } define dso_local void @func() #0 { ; CHECK-LABEL: func: - - +; CHECK: add x15, sp, #8 +; CHECK: bl {{_?other}} +; CHECK: ret entry: %p = alloca i64 -; CHECK: mov w8, #1 -; CHECK: stp x30, x8, [sp, #-16] -; CHECK-X18: add x18, sp, #8 store i64 1, ptr %p -; CHECK-NO-X18: 
add x0, sp, #8 -; CHECK: bl other call void @other(ptr nest %p) -; CHECK: ldr x30, [sp], #16 -; CHECK: ret ret void } diff --git a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll index 4799ea3bcd19f..986666e015e9e 100644 --- a/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll +++ b/llvm/test/CodeGen/AArch64/zero-call-used-regs.ll @@ -93,7 +93,7 @@ define dso_local i32 @all_gpr_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c ; CHECK-NEXT: mov x5, #0 // =0x0 ; CHECK-NEXT: mov x6, #0 // =0x0 ; CHECK-NEXT: mov x7, #0 // =0x0 -; CHECK-NEXT: mov x18, #0 // =0x0 +; CHECK-NEXT: mov x15, #0 // =0x0 ; CHECK-NEXT: orr w0, w8, w2 ; CHECK-NEXT: mov x2, #0 // =0x0 ; CHECK-NEXT: mov x8, #0 // =0x0 @@ -146,7 +146,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo ; DEFAULT-NEXT: mov x5, #0 // =0x0 ; DEFAULT-NEXT: mov x6, #0 // =0x0 ; DEFAULT-NEXT: mov x7, #0 // =0x0 -; DEFAULT-NEXT: mov x18, #0 // =0x0 +; DEFAULT-NEXT: mov x15, #0 // =0x0 ; DEFAULT-NEXT: movi v0.2d, #0000000000000000 ; DEFAULT-NEXT: orr w0, w8, w2 ; DEFAULT-NEXT: mov x2, #0 // =0x0 @@ -169,7 +169,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo ; SVE-OR-SME-NEXT: mov x5, #0 // =0x0 ; SVE-OR-SME-NEXT: mov x6, #0 // =0x0 ; SVE-OR-SME-NEXT: mov x7, #0 // =0x0 -; SVE-OR-SME-NEXT: mov x18, #0 // =0x0 +; SVE-OR-SME-NEXT: mov x15, #0 // =0x0 ; SVE-OR-SME-NEXT: mov z0.d, #0 // =0x0 ; SVE-OR-SME-NEXT: orr w0, w8, w2 ; SVE-OR-SME-NEXT: mov x2, #0 // =0x0 @@ -196,7 +196,7 @@ define dso_local i32 @all_arg(i32 noundef %a, i32 noundef %b, i32 noundef %c) lo ; STREAMING-COMPAT-NEXT: mov x5, #0 // =0x0 ; STREAMING-COMPAT-NEXT: mov x6, #0 // =0x0 ; STREAMING-COMPAT-NEXT: mov x7, #0 // =0x0 -; STREAMING-COMPAT-NEXT: mov x18, #0 // =0x0 +; STREAMING-COMPAT-NEXT: mov x15, #0 // =0x0 ; STREAMING-COMPAT-NEXT: fmov d0, xzr ; STREAMING-COMPAT-NEXT: orr w0, w8, w2 ; STREAMING-COMPAT-NEXT: mov x2, #0 // =0x0 @@ 
-492,7 +492,7 @@ define dso_local double @all_gpr_arg_float(double noundef %a, float noundef %b) ; CHECK-NEXT: mov x6, #0 // =0x0 ; CHECK-NEXT: mov x7, #0 // =0x0 ; CHECK-NEXT: mov x8, #0 // =0x0 -; CHECK-NEXT: mov x18, #0 // =0x0 +; CHECK-NEXT: mov x15, #0 // =0x0 ; CHECK-NEXT: ret entry: @@ -547,7 +547,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca ; DEFAULT-NEXT: mov x6, #0 // =0x0 ; DEFAULT-NEXT: mov x7, #0 // =0x0 ; DEFAULT-NEXT: mov x8, #0 // =0x0 -; DEFAULT-NEXT: mov x18, #0 // =0x0 +; DEFAULT-NEXT: mov x15, #0 // =0x0 ; DEFAULT-NEXT: movi v1.2d, #0000000000000000 ; DEFAULT-NEXT: movi v2.2d, #0000000000000000 ; DEFAULT-NEXT: movi v3.2d, #0000000000000000 @@ -570,7 +570,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca ; SVE-OR-SME-NEXT: mov x6, #0 // =0x0 ; SVE-OR-SME-NEXT: mov x7, #0 // =0x0 ; SVE-OR-SME-NEXT: mov x8, #0 // =0x0 -; SVE-OR-SME-NEXT: mov x18, #0 // =0x0 +; SVE-OR-SME-NEXT: mov x15, #0 // =0x0 ; SVE-OR-SME-NEXT: mov z1.d, #0 // =0x0 ; SVE-OR-SME-NEXT: mov z2.d, #0 // =0x0 ; SVE-OR-SME-NEXT: mov z3.d, #0 // =0x0 @@ -597,7 +597,7 @@ define dso_local double @all_arg_float(double noundef %a, float noundef %b) loca ; STREAMING-COMPAT-NEXT: mov x6, #0 // =0x0 ; STREAMING-COMPAT-NEXT: mov x7, #0 // =0x0 ; STREAMING-COMPAT-NEXT: mov x8, #0 // =0x0 -; STREAMING-COMPAT-NEXT: mov x18, #0 // =0x0 +; STREAMING-COMPAT-NEXT: mov x15, #0 // =0x0 ; STREAMING-COMPAT-NEXT: fmov d1, xzr ; STREAMING-COMPAT-NEXT: fmov d2, xzr ; STREAMING-COMPAT-NEXT: fmov d3, xzr From bb3b8306dc226c4dc4dfde36444b43476eea66ee Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 12 Jun 2025 10:48:32 +0800 Subject: [PATCH 171/851] [NFC] [C++20] [Modules] Add a test module local declaration lookup From https://github.com/llvm/llvm-project/issues/143734, but it looks good on trunk. Add it as tests are always good. 
--- .../Modules/module-local-declarations.cppm | 30 +++++++++++++++++++ 1 file changed, 30 insertions(+) create mode 100644 clang/test/Modules/module-local-declarations.cppm diff --git a/clang/test/Modules/module-local-declarations.cppm b/clang/test/Modules/module-local-declarations.cppm new file mode 100644 index 0000000000000..4fbcf09e4d792 --- /dev/null +++ b/clang/test/Modules/module-local-declarations.cppm @@ -0,0 +1,30 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/Base.cppm -emit-module-interface -o %t/Base.pcm +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm -fprebuilt-module-path=%t +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fsyntax-only -verify -fprebuilt-module-path=%t + +//--- Base.cppm +export module Base; +export template +class Base {}; + +//--- A.cppm +export module A; +import Base; +struct S {}; + +export Base a; + +//--- B.cppm +// expected-no-diagnostics +export module B; + +import A; +import Base; + +struct S {}; + +export Base b; From de51b2dd3c6fc995e7db56fc50b4c8dceddc0aab Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 11 Jun 2025 19:51:05 -0700 Subject: [PATCH 172/851] [lldb] Move Transport class into lldb_private (NFC) (#143806) Move lldb-dap's Transport class into lldb_private so the code can be shared between the "JSON with header" protocol used by DAP and the JSON RPC protocol used by MCP (see [1]). 
[1]: https://discourse.llvm.org/t/rfc-adding-mcp-support-to-lldb/86798 --- lldb/include/lldb/Host/JSONTransport.h | 126 +++++++++++++++++++ lldb/source/Host/CMakeLists.txt | 3 +- lldb/source/Host/common/JSONTransport.cpp | 147 ++++++++++++++++++++++ lldb/tools/lldb-dap/DAP.cpp | 7 +- lldb/tools/lldb-dap/Transport.cpp | 145 +-------------------- lldb/tools/lldb-dap/Transport.h | 65 ++-------- lldb/unittests/DAP/DAPTest.cpp | 7 +- lldb/unittests/DAP/TestBase.cpp | 3 +- lldb/unittests/DAP/TransportTest.cpp | 16 ++- 9 files changed, 308 insertions(+), 211 deletions(-) create mode 100644 lldb/include/lldb/Host/JSONTransport.h create mode 100644 lldb/source/Host/common/JSONTransport.cpp diff --git a/lldb/include/lldb/Host/JSONTransport.h b/lldb/include/lldb/Host/JSONTransport.h new file mode 100644 index 0000000000000..4db5e417ea852 --- /dev/null +++ b/lldb/include/lldb/Host/JSONTransport.h @@ -0,0 +1,126 @@ +//===-- JSONTransport.h ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Transport layer for encoding and decoding JSON protocol messages. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_HOST_JSONTRANSPORT_H +#define LLDB_HOST_JSONTRANSPORT_H + +#include "lldb/lldb-forward.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/JSON.h" +#include +#include + +namespace lldb_private { + +class TransportEOFError : public llvm::ErrorInfo { +public: + static char ID; + + TransportEOFError() = default; + + void log(llvm::raw_ostream &OS) const override { + OS << "transport end of file reached"; + } + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; + +class TransportTimeoutError : public llvm::ErrorInfo { +public: + static char ID; + + TransportTimeoutError() = default; + + void log(llvm::raw_ostream &OS) const override { + OS << "transport operation timed out"; + } + std::error_code convertToErrorCode() const override { + return std::make_error_code(std::errc::timed_out); + } +}; + +class TransportClosedError : public llvm::ErrorInfo { +public: + static char ID; + + TransportClosedError() = default; + + void log(llvm::raw_ostream &OS) const override { + OS << "transport is closed"; + } + std::error_code convertToErrorCode() const override { + return llvm::inconvertibleErrorCode(); + } +}; + +/// A transport class that uses JSON for communication. +class JSONTransport { +public: + JSONTransport(lldb::IOObjectSP input, lldb::IOObjectSP output); + virtual ~JSONTransport() = default; + + /// Transport is not copyable. + /// @{ + JSONTransport(const JSONTransport &rhs) = delete; + void operator=(const JSONTransport &rhs) = delete; + /// @} + + /// Writes a message to the output stream. + template llvm::Error Write(const T &t) { + const std::string message = llvm::formatv("{0}", toJSON(t)).str(); + return WriteImpl(message); + } + + /// Reads the next message from the input stream. 
+ template + llvm::Expected Read(const std::chrono::microseconds &timeout) { + llvm::Expected message = ReadImpl(timeout); + if (!message) + return message.takeError(); + return llvm::json::parse(/*JSON=*/*message); + } + +protected: + virtual void Log(llvm::StringRef message); + + virtual llvm::Error WriteImpl(const std::string &message) = 0; + virtual llvm::Expected + ReadImpl(const std::chrono::microseconds &timeout) = 0; + + lldb::IOObjectSP m_input; + lldb::IOObjectSP m_output; +}; + +/// A transport class for JSON with a HTTP header. +class HTTPDelimitedJSONTransport : public JSONTransport { +public: + HTTPDelimitedJSONTransport(lldb::IOObjectSP input, lldb::IOObjectSP output) + : JSONTransport(input, output) {} + virtual ~HTTPDelimitedJSONTransport() = default; + +protected: + virtual llvm::Error WriteImpl(const std::string &message) override; + virtual llvm::Expected + ReadImpl(const std::chrono::microseconds &timeout) override; + + // FIXME: Support any header. + static constexpr llvm::StringLiteral kHeaderContentLength = + "Content-Length: "; + static constexpr llvm::StringLiteral kHeaderSeparator = "\r\n\r\n"; +}; + +} // namespace lldb_private + +#endif diff --git a/lldb/source/Host/CMakeLists.txt b/lldb/source/Host/CMakeLists.txt index 5b713133afeaf..b15d72e61b6e5 100644 --- a/lldb/source/Host/CMakeLists.txt +++ b/lldb/source/Host/CMakeLists.txt @@ -27,8 +27,9 @@ add_host_subdirectory(common common/HostNativeThreadBase.cpp common/HostProcess.cpp common/HostThread.cpp - common/LockFileBase.cpp + common/JSONTransport.cpp common/LZMA.cpp + common/LockFileBase.cpp common/MainLoopBase.cpp common/MemoryMonitor.cpp common/MonitoringProcessLauncher.cpp diff --git a/lldb/source/Host/common/JSONTransport.cpp b/lldb/source/Host/common/JSONTransport.cpp new file mode 100644 index 0000000000000..103c76d25daf7 --- /dev/null +++ b/lldb/source/Host/common/JSONTransport.cpp @@ -0,0 +1,147 @@ +//===-- JSONTransport.cpp 
-------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/JSONTransport.h" +#include "lldb/Utility/IOObject.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" +#include "lldb/Utility/SelectHelper.h" +#include "lldb/Utility/Status.h" +#include "lldb/lldb-forward.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/raw_ostream.h" +#include +#include +#include + +using namespace llvm; +using namespace lldb; +using namespace lldb_private; + +/// ReadFull attempts to read the specified number of bytes. If EOF is +/// encountered, an end-of-file error is returned instead of data. +static Expected +ReadFull(IOObject &descriptor, size_t length, + std::optional timeout = std::nullopt) { + if (!descriptor.IsValid()) + return llvm::make_error(); + + bool timeout_supported = true; + // FIXME: SelectHelper does not work with NativeFile on Win32. +#if _WIN32 + timeout_supported = descriptor.GetFdType() == IOObject::eFDTypeSocket; +#endif + + if (timeout && timeout_supported) { + SelectHelper sh; + sh.SetTimeout(*timeout); + sh.FDSetRead(descriptor.GetWaitableHandle()); + Status status = sh.Select(); + if (status.Fail()) { + // Convert timeouts into a specific error. + if (status.GetType() == lldb::eErrorTypePOSIX && + status.GetError() == ETIMEDOUT) + return make_error(); + return status.takeError(); + } + } + + std::string data; + data.resize(length); + Status status = descriptor.Read(data.data(), length); + if (status.Fail()) + return status.takeError(); + + // Read returns '' on EOF. + if (length == 0) + return make_error(); + + // Return the actual number of bytes read. 
+ return data.substr(0, length); +} + +static Expected +ReadUntil(IOObject &descriptor, StringRef delimiter, + std::optional timeout = std::nullopt) { + std::string buffer; + buffer.reserve(delimiter.size() + 1); + while (!llvm::StringRef(buffer).ends_with(delimiter)) { + Expected next = + ReadFull(descriptor, buffer.empty() ? delimiter.size() : 1, timeout); + if (auto Err = next.takeError()) + return std::move(Err); + buffer += *next; + } + return buffer.substr(0, buffer.size() - delimiter.size()); +} + +JSONTransport::JSONTransport(IOObjectSP input, IOObjectSP output) + : m_input(std::move(input)), m_output(std::move(output)) {} + +void JSONTransport::Log(llvm::StringRef message) { + LLDB_LOG(GetLog(LLDBLog::Host), "{0}", message); +} + +Expected +HTTPDelimitedJSONTransport::ReadImpl(const std::chrono::microseconds &timeout) { + if (!m_input || !m_input->IsValid()) + return createStringError("transport output is closed"); + + IOObject *input = m_input.get(); + Expected message_header = + ReadFull(*input, kHeaderContentLength.size(), timeout); + if (!message_header) + return message_header.takeError(); + if (*message_header != kHeaderContentLength) + return createStringError(formatv("expected '{0}' and got '{1}'", + kHeaderContentLength, *message_header) + .str()); + + Expected raw_length = ReadUntil(*input, kHeaderSeparator); + if (!raw_length) + return handleErrors(raw_length.takeError(), + [&](const TransportEOFError &E) -> llvm::Error { + return createStringError( + "unexpected EOF while reading header separator"); + }); + + size_t length; + if (!to_integer(*raw_length, length)) + return createStringError( + formatv("invalid content length {0}", *raw_length).str()); + + Expected raw_json = ReadFull(*input, length); + if (!raw_json) + return handleErrors( + raw_json.takeError(), [&](const TransportEOFError &E) -> llvm::Error { + return createStringError("unexpected EOF while reading JSON"); + }); + + Log(llvm::formatv("--> {0}", *raw_json).str()); + + return 
raw_json; +} + +Error HTTPDelimitedJSONTransport::WriteImpl(const std::string &message) { + if (!m_output || !m_output->IsValid()) + return llvm::make_error(); + + Log(llvm::formatv("<-- {0}", message).str()); + + std::string Output; + raw_string_ostream OS(Output); + OS << kHeaderContentLength << message.length() << kHeaderSeparator << message; + size_t num_bytes = Output.size(); + return m_output->Write(Output.data(), num_bytes).takeError(); +} + +char TransportEOFError::ID; +char TransportTimeoutError::ID; +char TransportClosedError::ID; diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index b034c967594ba..9fe8227cd2d6f 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -70,6 +70,7 @@ using namespace lldb_dap; using namespace lldb_dap::protocol; +using namespace lldb_private; namespace { #ifdef _WIN32 @@ -893,14 +894,14 @@ llvm::Error DAP::Loop() { while (!disconnecting) { llvm::Expected next = - transport.Read(std::chrono::seconds(1)); - if (next.errorIsA()) { + transport.Read(std::chrono::seconds(1)); + if (next.errorIsA()) { consumeError(next.takeError()); break; } // If the read timed out, continue to check if we should disconnect. 
- if (next.errorIsA()) { + if (next.errorIsA()) { consumeError(next.takeError()); continue; } diff --git a/lldb/tools/lldb-dap/Transport.cpp b/lldb/tools/lldb-dap/Transport.cpp index 4e322e9ff1358..d602920da34e3 100644 --- a/lldb/tools/lldb-dap/Transport.cpp +++ b/lldb/tools/lldb-dap/Transport.cpp @@ -8,152 +8,19 @@ #include "Transport.h" #include "DAPLog.h" -#include "Protocol/ProtocolBase.h" -#include "lldb/Utility/IOObject.h" -#include "lldb/Utility/SelectHelper.h" -#include "lldb/Utility/Status.h" #include "lldb/lldb-forward.h" -#include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/raw_ostream.h" -#include -#include -#include using namespace llvm; using namespace lldb; using namespace lldb_private; using namespace lldb_dap; -using namespace lldb_dap::protocol; -/// ReadFull attempts to read the specified number of bytes. If EOF is -/// encountered, an empty string is returned. -static Expected -ReadFull(IOObject &descriptor, size_t length, - std::optional timeout = std::nullopt) { - if (!descriptor.IsValid()) - return createStringError("transport output is closed"); +Transport::Transport(llvm::StringRef client_name, lldb_dap::Log *log, + lldb::IOObjectSP input, lldb::IOObjectSP output) + : HTTPDelimitedJSONTransport(input, output), m_client_name(client_name), + m_log(log) {} - bool timeout_supported = true; - // FIXME: SelectHelper does not work with NativeFile on Win32. -#if _WIN32 - timeout_supported = descriptor.GetFdType() == IOObject::eFDTypeSocket; -#endif - - if (timeout && timeout_supported) { - SelectHelper sh; - sh.SetTimeout(*timeout); - sh.FDSetRead(descriptor.GetWaitableHandle()); - Status status = sh.Select(); - if (status.Fail()) { - // Convert timeouts into a specific error. 
- if (status.GetType() == lldb::eErrorTypePOSIX && - status.GetError() == ETIMEDOUT) - return make_error(); - return status.takeError(); - } - } - - std::string data; - data.resize(length); - Status status = descriptor.Read(data.data(), length); - if (status.Fail()) - return status.takeError(); - - // Read returns '' on EOF. - if (length == 0) - return make_error(); - - // Return the actual number of bytes read. - return data.substr(0, length); -} - -static Expected -ReadUntil(IOObject &descriptor, StringRef delimiter, - std::optional timeout = std::nullopt) { - std::string buffer; - buffer.reserve(delimiter.size() + 1); - while (!llvm::StringRef(buffer).ends_with(delimiter)) { - Expected next = - ReadFull(descriptor, buffer.empty() ? delimiter.size() : 1, timeout); - if (auto Err = next.takeError()) - return std::move(Err); - buffer += *next; - } - return buffer.substr(0, buffer.size() - delimiter.size()); -} - -/// DAP message format -/// ``` -/// Content-Length: (?\d+)\r\n\r\n(?.{\k}) -/// ``` -static constexpr StringLiteral kHeaderContentLength = "Content-Length: "; -static constexpr StringLiteral kHeaderSeparator = "\r\n\r\n"; - -namespace lldb_dap { - -char EndOfFileError::ID; -char TimeoutError::ID; - -Transport::Transport(StringRef client_name, Log *log, IOObjectSP input, - IOObjectSP output) - : m_client_name(client_name), m_log(log), m_input(std::move(input)), - m_output(std::move(output)) {} - -Expected Transport::Read(const std::chrono::microseconds &timeout) { - if (!m_input || !m_input->IsValid()) - return createStringError("transport output is closed"); - - IOObject *input = m_input.get(); - Expected message_header = - ReadFull(*input, kHeaderContentLength.size(), timeout); - if (!message_header) - return message_header.takeError(); - if (*message_header != kHeaderContentLength) - return createStringError(formatv("expected '{0}' and got '{1}'", - kHeaderContentLength, *message_header) - .str()); - - Expected raw_length = ReadUntil(*input, 
kHeaderSeparator); - if (!raw_length) - return handleErrors(raw_length.takeError(), - [&](const EndOfFileError &E) -> llvm::Error { - return createStringError( - "unexpected EOF while reading header separator"); - }); - - size_t length; - if (!to_integer(*raw_length, length)) - return createStringError( - formatv("invalid content length {0}", *raw_length).str()); - - Expected raw_json = ReadFull(*input, length); - if (!raw_json) - return handleErrors( - raw_json.takeError(), [&](const EndOfFileError &E) -> llvm::Error { - return createStringError("unexpected EOF while reading JSON"); - }); - - DAP_LOG(m_log, "--> ({0}) {1}", m_client_name, *raw_json); - - return json::parse(/*JSON=*/*raw_json, - /*RootName=*/"protocol_message"); +void Transport::Log(llvm::StringRef message) { + DAP_LOG(m_log, "({0}) {1}", m_client_name, message); } - -Error Transport::Write(const Message &message) { - if (!m_output || !m_output->IsValid()) - return createStringError("transport output is closed"); - - std::string json = formatv("{0}", toJSON(message)).str(); - - DAP_LOG(m_log, "<-- ({0}) {1}", m_client_name, json); - - std::string Output; - raw_string_ostream OS(Output); - OS << kHeaderContentLength << json.length() << kHeaderSeparator << json; - size_t num_bytes = Output.size(); - return m_output->Write(Output.data(), num_bytes).takeError(); -} - -} // end namespace lldb_dap diff --git a/lldb/tools/lldb-dap/Transport.h b/lldb/tools/lldb-dap/Transport.h index 4e347eaa51314..51f62e718a0d0 100644 --- a/lldb/tools/lldb-dap/Transport.h +++ b/lldb/tools/lldb-dap/Transport.h @@ -15,70 +15,21 @@ #define LLDB_TOOLS_LLDB_DAP_TRANSPORT_H #include "DAPForward.h" -#include "Protocol/ProtocolBase.h" +#include "lldb/Host/JSONTransport.h" #include "lldb/lldb-forward.h" #include "llvm/ADT/StringRef.h" -#include "llvm/Support/Error.h" -#include -#include namespace lldb_dap { -class EndOfFileError : public llvm::ErrorInfo { -public: - static char ID; - - EndOfFileError() = default; - - void 
log(llvm::raw_ostream &OS) const override { - OS << "end of file reached"; - } - std::error_code convertToErrorCode() const override { - return llvm::inconvertibleErrorCode(); - } -}; - -class TimeoutError : public llvm::ErrorInfo { -public: - static char ID; - - TimeoutError() = default; - - void log(llvm::raw_ostream &OS) const override { - OS << "operation timed out"; - } - std::error_code convertToErrorCode() const override { - return std::make_error_code(std::errc::timed_out); - } -}; - /// A transport class that performs the Debug Adapter Protocol communication /// with the client. -class Transport { +class Transport : public lldb_private::HTTPDelimitedJSONTransport { public: - Transport(llvm::StringRef client_name, Log *log, lldb::IOObjectSP input, - lldb::IOObjectSP output); - ~Transport() = default; - - /// Transport is not copyable. - /// @{ - Transport(const Transport &rhs) = delete; - void operator=(const Transport &rhs) = delete; - /// @} - - /// Writes a Debug Adater Protocol message to the output stream. - llvm::Error Write(const protocol::Message &M); + Transport(llvm::StringRef client_name, lldb_dap::Log *log, + lldb::IOObjectSP input, lldb::IOObjectSP output); + virtual ~Transport() = default; - /// Reads the next Debug Adater Protocol message from the input stream. - /// - /// \param timeout[in] - /// A timeout to wait for reading the initial header. Once a message - /// header is recieved, this will block until the full message is - /// read. - /// - /// \returns Returns the next protocol message. - llvm::Expected - Read(const std::chrono::microseconds &timeout); + virtual void Log(llvm::StringRef message) override; /// Returns the name of this transport client, for example `stdin/stdout` or /// `client_1`. 
@@ -86,9 +37,7 @@ class Transport { private: llvm::StringRef m_client_name; - Log *m_log; - lldb::IOObjectSP m_input; - lldb::IOObjectSP m_output; + lldb_dap::Log *m_log; }; } // namespace lldb_dap diff --git a/lldb/unittests/DAP/DAPTest.cpp b/lldb/unittests/DAP/DAPTest.cpp index 5fb6bf7e564ab..40ffaf87c9c45 100644 --- a/lldb/unittests/DAP/DAPTest.cpp +++ b/lldb/unittests/DAP/DAPTest.cpp @@ -32,7 +32,8 @@ TEST_F(DAPTest, SendProtocolMessages) { /*transport=*/*to_dap, }; dap.Send(Event{/*event=*/"my-event", /*body=*/std::nullopt}); - ASSERT_THAT_EXPECTED(from_dap->Read(std::chrono::milliseconds(1)), - HasValue(testing::VariantWith(testing::FieldsAre( - /*event=*/"my-event", /*body=*/std::nullopt)))); + ASSERT_THAT_EXPECTED( + from_dap->Read(std::chrono::milliseconds(1)), + HasValue(testing::VariantWith(testing::FieldsAre( + /*event=*/"my-event", /*body=*/std::nullopt)))); } diff --git a/lldb/unittests/DAP/TestBase.cpp b/lldb/unittests/DAP/TestBase.cpp index 388d1b901507e..4063b34250312 100644 --- a/lldb/unittests/DAP/TestBase.cpp +++ b/lldb/unittests/DAP/TestBase.cpp @@ -122,7 +122,8 @@ std::vector DAPTestBase::DrainOutput() { std::vector msgs; output.CloseWriteFileDescriptor(); while (true) { - Expected next = from_dap->Read(std::chrono::milliseconds(1)); + Expected next = + from_dap->Read(std::chrono::milliseconds(1)); if (!next) { consumeError(next.takeError()); break; diff --git a/lldb/unittests/DAP/TransportTest.cpp b/lldb/unittests/DAP/TransportTest.cpp index e6dab42e30941..aaf257993af23 100644 --- a/lldb/unittests/DAP/TransportTest.cpp +++ b/lldb/unittests/DAP/TransportTest.cpp @@ -26,6 +26,8 @@ using namespace lldb_dap::protocol; using lldb_private::File; using lldb_private::NativeFile; using lldb_private::Pipe; +using lldb_private::TransportEOFError; +using lldb_private::TransportTimeoutError; class TransportTest : public PipeBase { protected: @@ -50,7 +52,7 @@ TEST_F(TransportTest, MalformedRequests) { input.Write(malformed_header.data(), 
malformed_header.size()), Succeeded()); ASSERT_THAT_EXPECTED( - transport->Read(std::chrono::milliseconds(1)), + transport->Read(std::chrono::milliseconds(1)), FailedWithMessage( "expected 'Content-Length: ' and got 'COnTent-LenGth: '")); } @@ -63,20 +65,22 @@ TEST_F(TransportTest, Read) { ASSERT_THAT_EXPECTED(input.Write(message.data(), message.size()), Succeeded()); ASSERT_THAT_EXPECTED( - transport->Read(std::chrono::milliseconds(1)), + transport->Read(std::chrono::milliseconds(1)), HasValue(testing::VariantWith(testing::FieldsAre( /*seq=*/1, /*command=*/"abc", /*arguments=*/std::nullopt)))); } TEST_F(TransportTest, ReadWithTimeout) { - ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)), - Failed()); + ASSERT_THAT_EXPECTED( + transport->Read(std::chrono::milliseconds(1)), + Failed()); } TEST_F(TransportTest, ReadWithEOF) { input.CloseWriteFileDescriptor(); - ASSERT_THAT_EXPECTED(transport->Read(std::chrono::milliseconds(1)), - Failed()); + ASSERT_THAT_EXPECTED( + transport->Read(std::chrono::milliseconds(1)), + Failed()); } TEST_F(TransportTest, Write) { From faa49d6662b4c14438cc8e63a3751c22f28d2481 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Thu, 12 Jun 2025 02:53:03 +0000 Subject: [PATCH 173/851] [gn build] Port de51b2dd3c6f --- llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn index ca1acf9ba8aa4..b00442d8e1ebb 100644 --- a/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Host/BUILD.gn @@ -27,6 +27,7 @@ static_library("Host") { "common/HostNativeThreadBase.cpp", "common/HostProcess.cpp", "common/HostThread.cpp", + "common/JSONTransport.cpp", "common/LZMA.cpp", "common/LockFileBase.cpp", "common/MainLoopBase.cpp", From d8118ed6db28a3caaf3fa4a4f8d0d51d33b09c30 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 11 Jun 2025 20:00:45 -0700 
Subject: [PATCH 174/851] [ELF,test] Improve weak-undef-rw.s --- lld/test/ELF/weak-undef-rw.s | 54 +++++++++++++++++++++++------------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s index bbc37ba49304a..902cad87aba9a 100644 --- a/lld/test/ELF/weak-undef-rw.s +++ b/lld/test/ELF/weak-undef-rw.s @@ -3,12 +3,17 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64 a.s -o a.o # RUN: llvm-mc -filetype=obj -triple=x86_64 b.s -o b.o # RUN: llvm-mc -filetype=obj -triple=x86_64 c.s -o c.o -# RUN: ld.lld a.o -o nopie --export-dynamic -# RUN: llvm-readelf -r --hex-dump=.data nopie | FileCheck %s --check-prefix=STATIC -# RUN: ld.lld a.o -o out.pie -pie -# RUN: llvm-readelf -r --hex-dump=.data out.pie | FileCheck %s --check-prefix=STATIC -# RUN: ld.lld a.o -o out.so -shared -# RUN: llvm-readobj -r out.so | FileCheck %s --check-prefix=PIC +# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o s.o +# RUN: ld.lld -shared s.o -o s.so + +# RUN: ld.lld a.o -o a --export-dynamic +# RUN: llvm-readelf -r --hex-dump=.data a | FileCheck %s --check-prefix=STATIC +# RUN: ld.lld a.o s.so -o as +# RUN: llvm-readelf -r --hex-dump=.data as | FileCheck %s --check-prefix=STATIC +# RUN: ld.lld a.o -o a.pie -pie +# RUN: llvm-readelf -r --hex-dump=.data a.pie | FileCheck %s --check-prefix=STATIC +# RUN: ld.lld a.o -o a.so -shared +# RUN: llvm-readelf -r a.so | FileCheck %s --check-prefix=DYN ## gABI leaves the behavior of weak undefined references implementation defined. ## We choose to resolve them statically for static linking and produce dynamic relocations @@ -19,35 +24,44 @@ # STATIC: no relocations # STATIC: Hex dump of section '.data': -# STATIC-NEXT: {{.*}} 00000000 00000000 . +# STATIC-NEXT: {{.*}} 00000000 00000000 03000000 00000000 . 
# STATIC-EMPTY: -# PIC: .rela.dyn { -# PIC-NEXT: R_X86_64_64 foobar 0x0 -# PIC-NEXT: } +# DYN: Relocation section '.rela.dyn' {{.*}} contains 2 +# DYN: R_X86_64_64 0000000000000000 foobar + 0{{$}} -# RUN: ld.lld a.o b.o -o out1 -z undefs -# RUN: llvm-readelf -r -x .data out1 | FileCheck %s --check-prefix=STATIC1 -# RUN: ld.lld a.o b.o -o out1.pie -pie -z undefs -# RUN: llvm-readelf -r -x .data out1.pie | FileCheck %s --check-prefix=STATIC1 +# RUN: ld.lld a.o b.o -o ab -z undefs +# RUN: llvm-readelf -r -x .data ab | FileCheck %s --check-prefix=STATIC1 +# RUN: ld.lld a.o b.o s.so -o abs -z undefs +# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=DYN1 +# RUN: ld.lld a.o b.o -o abs.pie -pie -z undefs +# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=STATIC1 # STATIC1: no relocations # STATIC1: Hex dump of section '.data': -# STATIC1-NEXT: {{.*}} 00000000 00000000 00000000 00000000 . +# STATIC1-NEXT: {{.*}} 00000000 00000000 03000000 00000000 . +# STATIC1-NEXT: {{.*}} 05000000 00000000 . # STATIC1-EMPTY: +# DYN1: Relocation section '.rela.dyn' {{.*}} contains 1 +# DYN1: Hex dump of section '.data': +# DYN1-NEXT: {{.*}} 00000000 00000000 03000000 00000000 . +# DYN1-NEXT: {{.*}} 00000000 00000000 . 
+# DYN1-EMPTY: + # RUN: ld.lld a.o b.o c.o -pie -z undefs 2>&1 | count 0 #--- a.s - .global _start +.global _start _start: - .data - .weak foobar - .quad foobar +.data +.weak foobar +.quad foobar +.quad foobar+3 #--- b.s .data -.quad undef +.quad undef+5 #--- c.s call undef From b46f34452e9dec50eee6ddbe07875f05e421a81c Mon Sep 17 00:00:00 2001 From: Khem Raj Date: Wed, 11 Jun 2025 20:22:08 -0700 Subject: [PATCH 175/851] libunwind: Do not use __attribute__((target("gcs"))) with non-clang compilers (#138077) This attribute is unsupported in GCC, so far it worked because before GCC15 did not define this macros in _CHKFEAT_GCS in arm_acle.h [1] With gcc15 compiler libunwind's check for this macros is succeeding and it ends up enabling 'gcs' by using function attribute, this works with clang but not with gcc. We can see this in rust compiler bootstrap for aarch64/musl when system uses gcc15, it ends up with these errors Building libunwind.a for aarch64-poky-linux-musl ``` cargo:warning=/mnt/b/yoe/master/sources/poky/build/tmp/work/cortexa57-poky-linux-musl/rust/1.85.1/rustc-1.85.1-src/src/llvm-project/libunwind/src/UnwindLevel1.c:191:1: error: arch extension 'gcs' should be prefixed by '+' cargo:warning= 191 | unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) { cargo:warning= | ^~~~~~~~~~~~~ cargo:warning=/mnt/b/yoe/master/sources/poky/build/tmp/work/cortexa57-poky-linux-musl/rust/1.85.1/rustc-1.85.1-src/src/llvm-project/libunwind/src/UnwindLevel1.c:337:22: error: arch extension 'gcs' should be prefixed by '+' cargo:warning= 337 | _Unwind_Stop_Fn stop, void *stop_parameter) { cargo:warning= | ^~~~~~~~~~~~~~~ ``` [1] https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=5a6af707f0af Signed-off-by: Khem Raj --- libunwind/src/UnwindLevel1.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libunwind/src/UnwindLevel1.c b/libunwind/src/UnwindLevel1.c index a258a832a9c31..f3b451ad9b730 100644 --- 
a/libunwind/src/UnwindLevel1.c +++ b/libunwind/src/UnwindLevel1.c @@ -188,10 +188,11 @@ extern int __unw_step_stage2(unw_cursor_t *); #if defined(_LIBUNWIND_USE_GCS) // Enable the GCS target feature to permit gcspop instructions to be used. -__attribute__((target("gcs"))) +__attribute__((target("+gcs"))) #endif static _Unwind_Reason_Code -unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *exception_object) { +unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, + _Unwind_Exception *exception_object) { __unw_init_local(cursor, uc); _LIBUNWIND_TRACE_UNWINDING("unwind_phase2(ex_obj=%p)", @@ -332,12 +333,12 @@ unwind_phase2(unw_context_t *uc, unw_cursor_t *cursor, _Unwind_Exception *except #if defined(_LIBUNWIND_USE_GCS) // Enable the GCS target feature to permit gcspop instructions to be used. -__attribute__((target("gcs"))) +__attribute__((target("+gcs"))) #endif static _Unwind_Reason_Code unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor, - _Unwind_Exception *exception_object, - _Unwind_Stop_Fn stop, void *stop_parameter) { + _Unwind_Exception *exception_object, _Unwind_Stop_Fn stop, + void *stop_parameter) { __unw_init_local(cursor, uc); // uc is initialized by __unw_getcontext in the parent frame. The first stack @@ -443,7 +444,6 @@ unwind_phase2_forced(unw_context_t *uc, unw_cursor_t *cursor, return _URC_FATAL_PHASE2_ERROR; } - /// Called by __cxa_throw. Only returns if there is a fatal error. _LIBUNWIND_EXPORT _Unwind_Reason_Code _Unwind_RaiseException(_Unwind_Exception *exception_object) { From a71210e5abdbae80363cb5956a24a2004f625ca6 Mon Sep 17 00:00:00 2001 From: Kewen12 Date: Wed, 11 Jun 2025 20:24:56 -0700 Subject: [PATCH 176/851] Revert "[libc] Fix stdio tests after #143802" (#143824) Reverts llvm/llvm-project#143810 This PR breaks our buildbot: https://lab.llvm.org/buildbot/#/builders/10/builds/7159 revert to unblock downstream merge. 
--- libc/docs/configure.rst | 2 +- libc/test/src/stdio/fgetc_test.cpp | 1 - libc/test/src/stdio/fgetc_unlocked_test.cpp | 1 - libc/test/src/stdio/fgets_test.cpp | 1 - libc/test/src/stdio/setvbuf_test.cpp | 1 - 5 files changed, 1 insertion(+), 5 deletions(-) diff --git a/libc/docs/configure.rst b/libc/docs/configure.rst index 109412225634f..8d53390ae19bf 100644 --- a/libc/docs/configure.rst +++ b/libc/docs/configure.rst @@ -29,7 +29,7 @@ to learn about the defaults for your platform and target. - ``LIBC_CONF_ENABLE_STRONG_STACK_PROTECTOR``: Enable -fstack-protector-strong to defend against stack smashing attack. - ``LIBC_CONF_KEEP_FRAME_POINTER``: Keep frame pointer in functions for better debugging experience. * **"errno" options** - - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, LIBC_ERRNO_MODE_SYSTEM, and LIBC_ERRNO_MODE_SYSTEM_INLINE. + - ``LIBC_CONF_ERRNO_MODE``: The implementation used for errno, acceptable values are LIBC_ERRNO_MODE_DEFAULT, LIBC_ERRNO_MODE_UNDEFINED, LIBC_ERRNO_MODE_THREAD_LOCAL, LIBC_ERRNO_MODE_SHARED, LIBC_ERRNO_MODE_EXTERNAL, and LIBC_ERRNO_MODE_SYSTEM. * **"general" options** - ``LIBC_ADD_NULL_CHECKS``: Add nullptr checks in the library's implementations to some functions for which passing nullptr is undefined behavior. * **"math" options** diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 1faa49112fb63..7c652f666a8f3 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -33,7 +33,6 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index 7b2efe642fb5e..f4471dd82df15 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -36,7 +36,6 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index 2d7c68d490811..c00a9256af52d 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -36,7 +36,6 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); - ASSERT_ERRNO_FAILURE(); ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index a0936ba79ef73..4144bc1bef447 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -11,7 +11,6 @@ #include "src/stdio/fread.h" #include "src/stdio/fwrite.h" #include "src/stdio/setvbuf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" From 968d8eaa44c500259fe8d56ad77ec1c71cad35e2 Mon Sep 17 00:00:00 2001 From: Yang Zaizhou <91008302+Mxfg-incense@users.noreply.github.com> Date: Thu, 12 Jun 2025 11:28:57 +0800 Subject: [PATCH 177/851] [OpenMP][Flang]Fix omp_get_cancellation return type from integer to logical (#142990) --- openmp/runtime/src/include/omp_lib.F90.var | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/openmp/runtime/src/include/omp_lib.F90.var b/openmp/runtime/src/include/omp_lib.F90.var index 3463b698291e1..20639f60b5d97 100644 --- a/openmp/runtime/src/include/omp_lib.F90.var +++ b/openmp/runtime/src/include/omp_lib.F90.var @@ -399,7 +399,7 @@ function omp_get_cancellation() bind(c) use omp_lib_kinds - integer (kind=omp_integer_kind) omp_get_cancellation + logical (kind=omp_logical_kind) omp_get_cancellation end function omp_get_cancellation function omp_is_initial_device() bind(c) From 2fcaa00d1e2317a90c9071b735eb0e758b5dd58b Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 11 Jun 2025 20:37:15 -0700 Subject: [PATCH 178/851] [ELF] -z undefs: handle relocations referencing undefined non-weak like undefined weak * Merge the special case into isStaticLinkTimeConstant * Generalize isUndefWeak to isUndefined. undefined non-weak is an error case. We choose to be general, which also brings us in line with GNU ld. 
--- lld/ELF/Relocations.cpp | 25 ++++++++++--------------- lld/test/ELF/weak-undef-rw.s | 12 +++++++----- 2 files changed, 17 insertions(+), 20 deletions(-) diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 1af01e7247dce..6c4209a2b81ed 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -990,10 +990,17 @@ bool RelocationScanner::isStaticLinkTimeConstant(RelExpr e, RelType type, // only the low bits are used. if (e == R_GOT || e == R_PLT) return ctx.target->usesOnlyLowPageBits(type) || !ctx.arg.isPic; - // R_AARCH64_AUTH_ABS64 requires a dynamic relocation. - if (sym.isPreemptible || e == RE_AARCH64_AUTH) + if (e == RE_AARCH64_AUTH) return false; + + // The behavior of an undefined weak reference is implementation defined. + // (We treat undefined non-weak the same as undefined weak.) For static + // -no-pie linking, dynamic relocations are generally avoided (except + // IRELATIVE). Emitting dynamic relocations for -shared aligns with its -z + // undefs default. Dynamic -no-pie linking and -pie allow flexibility. + if (sym.isPreemptible) + return sym.isUndefined() && !ctx.arg.isPic; if (!ctx.arg.isPic) return true; @@ -1113,19 +1120,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, // If the relocation is known to be a link-time constant, we know no dynamic // relocation will be created, pass the control to relocateAlloc() or // relocateNonAlloc() to resolve it. - // - // The behavior of an undefined weak reference is implementation defined. For - // non-link-time constants, we resolve relocations statically (let - // relocate{,Non}Alloc() resolve them) for -no-pie and try producing dynamic - // relocations for -pie and -shared. - // - // The general expectation of -no-pie static linking is that there is no - // dynamic relocation (except IRELATIVE). Emitting dynamic relocations for - // -shared matches the spirit of its -z undefs default. 
-pie has freedom on - // choices, and we choose dynamic relocations to be consistent with the - // handling of GOT-generating relocations. - if (isStaticLinkTimeConstant(expr, type, sym, offset) || - (!ctx.arg.isPic && sym.isUndefWeak())) { + if (isStaticLinkTimeConstant(expr, type, sym, offset)) { sec->addReloc({expr, type, offset, addend, &sym}); return; } diff --git a/lld/test/ELF/weak-undef-rw.s b/lld/test/ELF/weak-undef-rw.s index 902cad87aba9a..497228a3cf905 100644 --- a/lld/test/ELF/weak-undef-rw.s +++ b/lld/test/ELF/weak-undef-rw.s @@ -33,9 +33,11 @@ # RUN: ld.lld a.o b.o -o ab -z undefs # RUN: llvm-readelf -r -x .data ab | FileCheck %s --check-prefix=STATIC1 # RUN: ld.lld a.o b.o s.so -o abs -z undefs -# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=DYN1 -# RUN: ld.lld a.o b.o -o abs.pie -pie -z undefs -# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=STATIC1 +# RUN: llvm-readelf -r -x .data abs | FileCheck %s --check-prefix=STATIC1 +# RUN: ld.lld a.o b.o -o ab.pie -pie -z undefs +# RUN: llvm-readelf -r -x .data ab.pie | FileCheck %s --check-prefix=STATIC1 +# RUN: ld.lld a.o b.o s.so -o abs.pie -pie -z undefs +# RUN: llvm-readelf -r -x .data abs.pie | FileCheck %s --check-prefix=DYN1 # STATIC1: no relocations # STATIC1: Hex dump of section '.data': @@ -43,9 +45,9 @@ # STATIC1-NEXT: {{.*}} 05000000 00000000 . # STATIC1-EMPTY: -# DYN1: Relocation section '.rela.dyn' {{.*}} contains 1 +# DYN1: Relocation section '.rela.dyn' {{.*}} contains 3 # DYN1: Hex dump of section '.data': -# DYN1-NEXT: {{.*}} 00000000 00000000 03000000 00000000 . +# DYN1-NEXT: {{.*}} 00000000 00000000 00000000 00000000 . # DYN1-NEXT: {{.*}} 00000000 00000000 . 
# DYN1-EMPTY: From 5f231db76482bbdd3e658d8e9797cbd46837d4e1 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Thu, 12 Jun 2025 11:41:52 +0800 Subject: [PATCH 179/851] [RISCV] Use StringRef for RequiredExtensions in RVVIntrinsicDef (#143503) This prevents many duplicated copies of required extensions string. --- clang/lib/Sema/SemaRISCV.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index 9f70be746eb3f..9eab0c2a0df6a 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -47,7 +47,7 @@ struct RVVIntrinsicDef { std::string BuiltinName; /// Mapping to RequiredFeatures in riscv_vector.td - std::string RequiredExtensions; + StringRef RequiredExtensions; /// Function signature, first element is return type. RVVTypes Signature; From f09050fdc85074869f0b34f0d9e061a74ef549ee Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 12 Jun 2025 11:35:44 +0800 Subject: [PATCH 180/851] [C++20] [Modules] Fix module local lookup ambiguousity Close https://github.com/llvm/llvm-project/issues/61360 Close https://github.com/llvm/llvm-project/issues/129525 Close https://github.com/llvm/llvm-project/issues/143734 We shouldn't identify different module local decls in different modules as the same entity. 
--- clang/include/clang/AST/ASTContext.h | 6 ++-- clang/include/clang/AST/DeclBase.h | 4 +++ clang/lib/AST/ASTContext.cpp | 8 ++++- clang/lib/AST/DeclBase.cpp | 6 ++++ .../Modules/module-local-declarations-02.cppm | 31 +++++++++++++++++++ clang/test/Modules/pr61360.cppm | 25 +++++++++++++++ 6 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 clang/test/Modules/module-local-declarations-02.cppm create mode 100644 clang/test/Modules/pr61360.cppm diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h index 8d24d393eab09..3abb49312255a 100644 --- a/clang/include/clang/AST/ASTContext.h +++ b/clang/include/clang/AST/ASTContext.h @@ -488,8 +488,8 @@ class ASTContext : public RefCountedBase { /// if possible. /// /// Not serialized intentionally. - llvm::StringMap PrimaryModuleNameMap; - llvm::DenseMap SameModuleLookupSet; + mutable llvm::StringMap PrimaryModuleNameMap; + mutable llvm::DenseMap SameModuleLookupSet; static constexpr unsigned ConstantArrayTypesLog2InitSize = 8; static constexpr unsigned GeneralTypesLog2InitSize = 9; @@ -1151,7 +1151,7 @@ class ASTContext : public RefCountedBase { /// /// FIXME: The signature may be confusing since `clang::Module` means to /// a module fragment or a module unit but not a C++20 module. - bool isInSameModule(const Module *M1, const Module *M2); + bool isInSameModule(const Module *M1, const Module *M2) const; TranslationUnitDecl *getTranslationUnitDecl() const { return TUDecl->getMostRecentDecl(); diff --git a/clang/include/clang/AST/DeclBase.h b/clang/include/clang/AST/DeclBase.h index 375e9e2592502..dd67ebc9873ff 100644 --- a/clang/include/clang/AST/DeclBase.h +++ b/clang/include/clang/AST/DeclBase.h @@ -646,6 +646,10 @@ class alignas(8) Decl { return getModuleOwnershipKind() == ModuleOwnershipKind::ModulePrivate; } + /// Whether this declaration was a local declaration to a C++20 + /// named module. 
+ bool isModuleLocal() const; + /// Whether this declaration was exported in a lexical context. /// e.g.: /// diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index b51f7622288df..4d44f23c0f503 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1175,7 +1175,7 @@ void ASTContext::setCurrentNamedModule(Module *M) { CurrentCXXNamedModule = M; } -bool ASTContext::isInSameModule(const Module *M1, const Module *M2) { +bool ASTContext::isInSameModule(const Module *M1, const Module *M2) const { if (!M1 != !M2) return false; @@ -7429,6 +7429,12 @@ bool ASTContext::isSameEntity(const NamedDecl *X, const NamedDecl *Y) const { cast(Y->getDeclContext()->getRedeclContext()))) return false; + // If either X or Y are local to the owning module, they are only possible to + // be the same entity if they are in the same module. + if (X->isModuleLocal() || Y->isModuleLocal()) + if (!isInSameModule(X->getOwningModule(), Y->getOwningModule())) + return false; + // Two typedefs refer to the same entity if they have the same underlying // type. 
if (const auto *TypedefX = dyn_cast(X)) diff --git a/clang/lib/AST/DeclBase.cpp b/clang/lib/AST/DeclBase.cpp index a1bb62bcb68fa..48c60aa4e449a 100644 --- a/clang/lib/AST/DeclBase.cpp +++ b/clang/lib/AST/DeclBase.cpp @@ -1132,6 +1132,12 @@ bool Decl::isInExportDeclContext() const { return isa_and_nonnull(DC); } +bool Decl::isModuleLocal() const { + auto *M = getOwningModule(); + return M && M->isNamedModule() && + getModuleOwnershipKind() == ModuleOwnershipKind::ReachableWhenImported; +} + bool Decl::isInAnotherModuleUnit() const { auto *M = getOwningModule(); diff --git a/clang/test/Modules/module-local-declarations-02.cppm b/clang/test/Modules/module-local-declarations-02.cppm new file mode 100644 index 0000000000000..0670c4295abc7 --- /dev/null +++ b/clang/test/Modules/module-local-declarations-02.cppm @@ -0,0 +1,31 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fprebuilt-module-path=%t -emit-llvm -o %t/B.ll + +//--- A.cppm +export module A; + +export template +struct holder { +}; + +struct foo {}; + +export struct a { + holder m; +}; + +//--- B.cppm +// expected-no-diagnostics +export module B; + +import A; + +struct foo {}; + +struct b { + holder m; +}; \ No newline at end of file diff --git a/clang/test/Modules/pr61360.cppm b/clang/test/Modules/pr61360.cppm new file mode 100644 index 0000000000000..a16f65d4be2fe --- /dev/null +++ b/clang/test/Modules/pr61360.cppm @@ -0,0 +1,25 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/A.cppm -emit-module-interface -o %t/A.pcm +// RUN: %clang_cc1 -std=c++20 %t/B.cppm -fprebuilt-module-path=%t -emit-llvm -o %t/B.ll + +//--- A.cppm +export module A; +export template +struct holder { +}; + +struct a { + holder m; +}; + +//--- B.cppm +// expected-no-diagnostics +export module B; +import A; + +struct b { + holder m; +}; From 
282e471018d234f78b0990100834532389877519 Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Thu, 12 Jun 2025 05:58:55 +0200 Subject: [PATCH 181/851] [flang] Erase `fir.local` ops before lowering `fir` to `llvm` (#143687) `fir.local` ops are not supposed to have any uses at this point (i.e. during lowering to LLVM). In case of serialization, the `fir.do_concurrent` users are expected to have been lowered to `fir.do_loop` nests. In case of parallelization, the `fir.do_concurrent` users are expected to have been lowered to the target parallel model (e.g. OpenMP). This hopefully resolved a build issue introduced by https://github.com/llvm/llvm-project/pull/142567 (see for example: https://lab.llvm.org/buildbot/#/builders/199/builds/4009). --- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 42 +++++++++++++++++++------ flang/test/Fir/local.fir | 10 ++++++ 2 files changed, 43 insertions(+), 9 deletions(-) create mode 100644 flang/test/Fir/local.fir diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 82d960a6fc61e..a3de3ae9d116a 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -3294,6 +3294,30 @@ struct LoadOpConversion : public fir::FIROpConversion { } }; +struct LocalitySpecifierOpConversion + : public fir::FIROpConversion { + using FIROpConversion::FIROpConversion; + llvm::LogicalResult + matchAndRewrite(fir::LocalitySpecifierOp localizer, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const override { +#ifdef EXPENSIVE_CHECKS + auto uses = mlir::SymbolTable::getSymbolUses( + localizer, localizer->getParentOfType()); + + // `fir.local` ops are not supposed to have any uses at this point (i.e. + // during lowering to LLVM). In case of serialization, the + // `fir.do_concurrent` users are expected to have been lowered to + // `fir.do_loop` nests. 
In case of parallelization, the `fir.do_concurrent` + // users are expected to have been lowered to the target parallel model + // (e.g. OpenMP). + assert(uses && uses->empty()); +#endif + + rewriter.eraseOp(localizer); + return mlir::success(); + } +}; + /// Lower `fir.no_reassoc` to LLVM IR dialect. /// TODO: how do we want to enforce this in LLVM-IR? Can we manipulate the fast /// math flags? @@ -4249,15 +4273,15 @@ void fir::populateFIRToLLVMConversionPatterns( FieldIndexOpConversion, FirEndOpConversion, FreeMemOpConversion, GlobalLenOpConversion, GlobalOpConversion, InsertOnRangeOpConversion, IsPresentOpConversion, LenParamIndexOpConversion, LoadOpConversion, - MulcOpConversion, NegcOpConversion, NoReassocOpConversion, - SelectCaseOpConversion, SelectOpConversion, SelectRankOpConversion, - SelectTypeOpConversion, ShapeOpConversion, ShapeShiftOpConversion, - ShiftOpConversion, SliceOpConversion, StoreOpConversion, - StringLitOpConversion, SubcOpConversion, TypeDescOpConversion, - TypeInfoOpConversion, UnboxCharOpConversion, UnboxProcOpConversion, - UndefOpConversion, UnreachableOpConversion, XArrayCoorOpConversion, - XEmboxOpConversion, XReboxOpConversion, ZeroOpConversion>(converter, - options); + LocalitySpecifierOpConversion, MulcOpConversion, NegcOpConversion, + NoReassocOpConversion, SelectCaseOpConversion, SelectOpConversion, + SelectRankOpConversion, SelectTypeOpConversion, ShapeOpConversion, + ShapeShiftOpConversion, ShiftOpConversion, SliceOpConversion, + StoreOpConversion, StringLitOpConversion, SubcOpConversion, + TypeDescOpConversion, TypeInfoOpConversion, UnboxCharOpConversion, + UnboxProcOpConversion, UndefOpConversion, UnreachableOpConversion, + XArrayCoorOpConversion, XEmboxOpConversion, XReboxOpConversion, + ZeroOpConversion>(converter, options); // Patterns that are populated without a type converter do not trigger // target materializations for the operands of the root op. 
diff --git a/flang/test/Fir/local.fir b/flang/test/Fir/local.fir new file mode 100644 index 0000000000000..006f5ca944670 --- /dev/null +++ b/flang/test/Fir/local.fir @@ -0,0 +1,10 @@ +// RUN: fir-opt --fir-to-llvm-ir %s | FileCheck %s + +// Tests that `fir.local` ops are dropped from the module before LLVM lowering. + +fir.local {type = local} @local_privatizer : i32 +func.func @foo() { + return +} + +// CHECK-NOT: fir.local From c3be4524a56ba01bc1f868fc37e329f24ec5041c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 11 Jun 2025 21:23:06 -0700 Subject: [PATCH 182/851] [ELF,test] Improve weak-undef-got-plt.s --- lld/test/ELF/weak-undef-got-pie.s | 22 -------------------- lld/test/ELF/weak-undef-got-plt.s | 34 +++++++++++++++++++++++++++++++ lld/test/ELF/weak-undef.s | 31 ---------------------------- 3 files changed, 34 insertions(+), 53 deletions(-) delete mode 100644 lld/test/ELF/weak-undef-got-pie.s create mode 100644 lld/test/ELF/weak-undef-got-plt.s delete mode 100644 lld/test/ELF/weak-undef.s diff --git a/lld/test/ELF/weak-undef-got-pie.s b/lld/test/ELF/weak-undef-got-pie.s deleted file mode 100644 index 2301400f4e0b1..0000000000000 --- a/lld/test/ELF/weak-undef-got-pie.s +++ /dev/null @@ -1,22 +0,0 @@ -# REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/dummy-shared.s -o %t1.o -# RUN: ld.lld %t1.o -shared -o %t1.so -# RUN: llvm-mc -filetype=obj -x86-relax-relocations=false -triple=x86_64 %s -o %t.o - -# RUN: ld.lld -pie %t.o %t1.so -o %t -# RUN: llvm-readobj -r %t | FileCheck --check-prefix=RELOCS %s -# RUN: llvm-objdump -d --no-show-raw-insn %t | FileCheck --check-prefix=DISASM %s - -# RELOCS: Relocations [ -# RELOCS-NEXT: Section ({{.*}}) .rela.dyn { -# RELOCS-NEXT: R_X86_64_GLOB_DAT foo 0x0 -# RELOCS-NEXT: } -# RELOCS-NEXT: ] - -.weak foo - -.globl _start -_start: -# DISASM: <_start>: -# DISASM-NEXT: movq {{.*}}(%rip), %rax -mov foo@gotpcrel(%rip), %rax diff --git a/lld/test/ELF/weak-undef-got-plt.s 
b/lld/test/ELF/weak-undef-got-plt.s new file mode 100644 index 0000000000000..0ee3da2cd3b40 --- /dev/null +++ b/lld/test/ELF/weak-undef-got-plt.s @@ -0,0 +1,34 @@ +# REQUIRES: x86 +# RUN: rm -rf %t && split-file %s %t && cd %t +# RUN: llvm-mc -filetype=obj -triple=x86_64 -x86-relax-relocations=false a.s -o a.o +# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o s.o +# RUN: ld.lld -shared s.o -o s.so + +# RUN: ld.lld a.o -o a +# RUN: llvm-readelf -r a | FileCheck %s --check-prefix=NORELOC +# RUN: ld.lld a.o s.so -o as +# RUN: llvm-objdump -dR as | FileCheck %s + +# RUN: ld.lld -pie a.o s.so -o as.pie +# RUN: llvm-objdump -dR as.pie | FileCheck %s + +# RUN: ld.lld -shared a.o -o a.so +# RUN: llvm-objdump -dR a.so | FileCheck %s + +# NORELOC: no relocation + +# CHECK: TYPE VALUE +# CHECK-NEXT: R_X86_64_GLOB_DAT foo{{$}} +# CHECK-NEXT: R_X86_64_JUMP_SLOT foo{{$}} +# CHECK-EMPTY: +# CHECK: <_start>: +# CHECK-NEXT: movq {{.*}}(%rip), %rax +# CHECK-NEXT: callq {{.*}} + +#--- a.s +.weak foo + +.globl _start +_start: +mov foo@gotpcrel(%rip), %rax +call foo diff --git a/lld/test/ELF/weak-undef.s b/lld/test/ELF/weak-undef.s deleted file mode 100644 index 21488023a79e1..0000000000000 --- a/lld/test/ELF/weak-undef.s +++ /dev/null @@ -1,31 +0,0 @@ -# REQUIRES: x86 -# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o -# RUN: ld.lld %t.o -o %t --export-dynamic -# RUN: llvm-readelf -r --dyn-syms --hex-dump=.data %t | \ -# RUN: FileCheck %s --check-prefixes=NORELOC,COMMON - -# NORELOC: There are no relocations in this file. 
- -# RUN: llvm-mc -filetype=obj -triple=x86_64 %p/Inputs/dummy-shared.s -o %t1.o -# RUN: ld.lld %t1.o -shared -o %t1.so -# RUN: ld.lld %t.o -o %t %t1.so -pie -# RUN: llvm-readelf -r --dyn-syms --hex-dump=.data %t | \ -# RUN: FileCheck %s --check-prefixes=RELOC,COMMON - -# RELOC: Relocation section '.rela.dyn' at offset {{.*}} contains 1 entries: -# RELOC-NEXT: Offset Info Type Symbol's Value Symbol's Name + Addend -# RELOC-NEXT: {{.*}} 0000000100000001 R_X86_64_64 0000000000000000 foo + 0 - -# NORELOC-NOT: Symbol table '.dynsym' -# RELOC: Symbol table '.dynsym' contains 2 entries: -# RELOC-NEXT: Num: Value Size Type Bind Vis Ndx Name -# RELOC-NEXT: 0: 0000000000000000 0 NOTYPE LOCAL DEFAULT UND -# RELOC-NEXT: 1: 0000000000000000 0 NOTYPE WEAK DEFAULT UND foo -# COMMON: Hex dump of section '.data': -# COMMON-NEXT: {{.*}} 00000000 00000000 -# COMMON-EMPTY: - -.weak foo - -.data - .dc.a foo From a93e55e57ed00a55f822c64e3520c7c732b58480 Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Wed, 11 Jun 2025 21:33:46 -0700 Subject: [PATCH 183/851] Revert "[libc] Migrate stdio tests to ErrnoCheckingTest." (#143829) Reverts llvm/llvm-project#143802. Follow-up fix 3c7af175e51c3ab08ac3c442146c2b822f38c01e wasn't robust enough and itself got reverted. 
--- libc/test/src/stdio/CMakeLists.txt | 10 ---------- libc/test/src/stdio/fdopen_test.cpp | 14 ++++++++------ libc/test/src/stdio/fgetc_test.cpp | 5 +++-- libc/test/src/stdio/fgetc_unlocked_test.cpp | 5 +++-- libc/test/src/stdio/fgets_test.cpp | 6 +++--- libc/test/src/stdio/fileop_test.cpp | 20 +++++++++++++++----- libc/test/src/stdio/fopencookie_test.cpp | 15 ++++++++------- libc/test/src/stdio/remove_test.cpp | 10 +++++----- libc/test/src/stdio/rename_test.cpp | 9 ++++----- libc/test/src/stdio/setvbuf_test.cpp | 8 ++++---- libc/test/src/stdio/unlocked_fileop_test.cpp | 7 ++++--- libc/test/src/stdlib/StrtolTest.h | 1 + libc/test/src/stdlib/strtold_test.cpp | 1 + 13 files changed, 59 insertions(+), 52 deletions(-) diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index 3627006ec28fd..01904a30504ed 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -20,7 +20,6 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -69,7 +68,6 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fwrite libc.src.stdio.setvbuf - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -90,7 +88,6 @@ add_libc_test( libc.src.stdio.fread_unlocked libc.src.stdio.funlockfile libc.src.stdio.fwrite_unlocked - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -112,7 +109,6 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite - libc.test.UnitTest.ErrnoCheckingTest LINK_LIBRARIES LibcMemoryHelpers ) @@ -430,7 +426,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.sys.stat.mkdirat libc.src.unistd.access libc.src.unistd.close - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -445,7 +440,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.rename libc.src.unistd.access libc.src.unistd.close - libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) @@ -462,7 +456,6 
@@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.fgets libc.src.stdio.fputs libc.src.unistd.close - libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) endif() @@ -483,7 +476,6 @@ add_libc_test( libc.src.stdio.fopen libc.src.stdio.fwrite libc.src.stdio.getc - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -506,7 +498,6 @@ add_libc_test( libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.getc_unlocked - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -524,7 +515,6 @@ add_libc_test( libc.src.stdio.fgets libc.src.stdio.fopen libc.src.stdio.fwrite - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index b53184c30be36..104fc478b100e 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -9,21 +9,20 @@ #include "src/stdio/fdopen.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/fclose.h" #include "src/stdio/fgets.h" #include "src/stdio/fputs.h" #include "src/unistd/close.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include // For S_IRWXU -using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { +TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); @@ -53,7 +52,8 @@ TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { ASSERT_ERRNO_SUCCESS(); } -TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { +TEST(LlvmLibcStdioFdopenTest, InvalidFd) { + libc_errno = 0; constexpr const char 
*TEST_FILE_NAME = "testdata/invalid_fd.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC); @@ -64,7 +64,8 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { ASSERT_TRUE(nullptr == fp); } -TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { +TEST(LlvmLibcStdioFdopenTest, InvalidMode) { + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU); @@ -82,6 +83,7 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w"); ASSERT_ERRNO_EQ(EINVAL); ASSERT_TRUE(nullptr == fp2); + libc_errno = 0; LIBC_NAMESPACE::close(fd); ASSERT_ERRNO_SUCCESS(); } diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index 7c652f666a8f3..56bde5f0099a8 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -14,12 +14,12 @@ #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -33,6 +33,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index f4471dd82df15..90429ecf4e82b 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -17,12 +17,12 @@ #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc_unlocked.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -36,6 +36,7 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index c00a9256af52d..abed3d4052939 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -12,12 +12,11 @@ #include "src/stdio/fgets.h" #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; +#include "src/__support/libc_errno.h" -TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { +TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { constexpr char FILENAME[] = "testdata/fgets.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -36,6 +35,7 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index e097785832d56..e624181c795b8 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -17,18 +17,17 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns; -TEST_F(LlvmLibcFILETest, SimpleFileOperations) { +TEST(LlvmLibcFILETest, SimpleFileOperations) { constexpr char FILENAME[] = "testdata/simple_operations.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -42,6 +41,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); @@ -72,6 +72,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); @@ -79,12 +80,15 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file), returns(EQ(EOF)).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); 
ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file), returns(EQ(size_t(0))).with_errno(NE(0))); + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); @@ -99,8 +103,10 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); // This is not a readable file. + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), returns(EQ(0)).with_errno(NE(0))); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -115,18 +121,21 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { // Check that the other functions correctly set libc_errno. + // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0); // ASSERT_ERRNO_FAILURE(); + // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0); // ASSERT_ERRNO_FAILURE(); + // libc_errno = 0; // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"), // static_cast(nullptr)); // ASSERT_ERRNO_FAILURE(); } -TEST_F(LlvmLibcFILETest, FFlush) { +TEST(LlvmLibcFILETest, FFlush) { constexpr char FILENAME[] = "testdata/fflush.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+"); ASSERT_FALSE(file == nullptr); @@ -147,7 +156,7 @@ TEST_F(LlvmLibcFILETest, FFlush) { ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); } -TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { +TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { using MyStruct = struct { char c; unsigned long long i; @@ -156,6 +165,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct); constexpr char FILENAME[] = "testdata/fread_fwrite.test"; + libc_errno = 0; FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file)); diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp index bcf5e674141a7..03e1ac286b646 
100644 --- a/libc/test/src/stdio/fopencookie_test.cpp +++ b/libc/test/src/stdio/fopencookie_test.cpp @@ -15,7 +15,6 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/MemoryMatcher.h" #include "test/UnitTest/Test.h" @@ -23,7 +22,6 @@ #include "hdr/types/size_t.h" #include "src/__support/libc_errno.h" -using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using MemoryView = LIBC_NAMESPACE::testing::MemoryView; struct StringStream { @@ -90,7 +88,7 @@ int close_ss(void *cookie) { constexpr cookie_io_functions_t STRING_STREAM_FUNCS = {&read_ss, &write_ss, &seek_ss, &close_ss}; -TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { +TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { constexpr char CONTENT[] = "Hello,readonly!"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(CONTENT))); @@ -117,6 +115,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -125,7 +124,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { free(ss); } -TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { +TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { size_t INIT_BUFSIZE = 32; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(INIT_BUFSIZE)); @@ -150,6 +149,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_EQ(EBADF); + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -158,7 +158,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { free(ss); } 
-TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { +TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { constexpr char INITIAL_CONTENT[] = "1234567890987654321"; constexpr char WRITE_DATA[] = "append"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); @@ -178,6 +178,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -191,7 +192,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { free(ss); } -TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) { +TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) { const char INITIAL_CONTENT[] = "1234567890987654321"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(INITIAL_CONTENT))); @@ -222,7 +223,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) { free(ss); } -TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) { +TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) { constexpr char WRITE_DATA[] = "hello, file"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(WRITE_DATA))); diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp index 296bff1f5dc15..84984e26398c0 100644 --- a/libc/test/src/stdio/remove_test.cpp +++ b/libc/test/src/stdio/remove_test.cpp @@ -11,17 +11,16 @@ #include "src/sys/stat/mkdirat.h" #include "src/unistd/access.h" #include "src/unistd/close.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +#include "src/__support/libc_errno.h" #include -using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { +TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { // The test strategy is to 
create a file and remove it, and also verify that // it was removed. + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -37,9 +36,10 @@ TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT)); } -TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) { +TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) { // The test strategy is to create a dir and remove it, and also verify that // it was removed. + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILENAME = "remove.test.dir"; diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp index 135fb98c07fbb..ac494a4ecaf8e 100644 --- a/libc/test/src/stdio/rename_test.cpp +++ b/libc/test/src/stdio/rename_test.cpp @@ -8,19 +8,18 @@ #include "include/llvm-libc-macros/linux/sys-stat-macros.h" #include "include/llvm-libc-macros/linux/unistd-macros.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/rename.h" #include "src/unistd/access.h" #include "src/unistd/close.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { +TEST(LlvmLibcRenameTest, CreateAndRenameFile) { // The test strategy is to create a file and rename it, and also verify that // it was renamed. 
+ libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -41,7 +40,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT)); } -TEST_F(LlvmLibcRenameTest, RenameNonExistent) { +TEST(LlvmLibcRenameTest, RenameNonExistent) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; constexpr const char *FILENAME1 = "rename.test.file1"; diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index 4144bc1bef447..5872943c1bb41 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -14,10 +14,9 @@ #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { +TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a NBF buffer to the write handle. Since it is NBF, the data // written using the write handle should be immediately readable by the read @@ -53,7 +52,7 @@ TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr)); } -TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) { +TEST(LlvmLibcSetvbufTest, SetLBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a LBF buffer to the write handle. 
Since it is LBF, the data // written using the write handle should be available right after a '\n' is @@ -103,5 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) { 0); ASSERT_ERRNO_EQ(EINVAL); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f)); } diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp index e99b382d12112..5d482b70064bd 100644 --- a/libc/test/src/stdio/unlocked_fileop_test.cpp +++ b/libc/test/src/stdio/unlocked_fileop_test.cpp @@ -15,12 +15,11 @@ #include "src/stdio/fread_unlocked.h" #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite_unlocked.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; +#include "src/__support/libc_errno.h" -TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { +TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { constexpr char fNAME[] = "testdata/unlocked_read_and_write.test"; ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w"); ASSERT_FALSE(f == nullptr); @@ -37,6 +36,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); @@ -57,6 +57,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 03f0a6539c785..3eeccc5727e77 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -9,6 +9,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" #include 
"src/__support/ctype_utils.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index eb4056dc7ba64..c2f2b9c9a11c3 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/libc_errno.h" #include "src/__support/uint128.h" #include "src/stdlib/strtold.h" From 99638537cd19b84252685a3dd56535a4d54d690e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 11 Jun 2025 21:56:48 -0700 Subject: [PATCH 184/851] [AArch64] Fix a warning This patch fixes: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp:7157:3: error: unannotated fall-through between switch labels [-Werror,-Wimplicit-fallthrough] --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ad5b90984188e..af5dfd6c9b8f4 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7154,6 +7154,7 @@ SDValue AArch64TargetLowering::LowerINIT_TRAMPOLINE(SDValue Op, switch (CC) { default: NestReg = 0x0f; // X15 + LLVM_FALLTHROUGH; case CallingConv::ARM64EC_Thunk_Native: case CallingConv::ARM64EC_Thunk_X64: // Must be kept in sync with AArch64CallingConv.td From 02550da932913bd7c3987c68abc9060c9e5bde2c Mon Sep 17 00:00:00 2001 From: Fazlay Rabbi <106703039+mdfazlay@users.noreply.github.com> Date: Wed, 11 Jun 2025 22:06:11 -0700 Subject: [PATCH 185/851] [OpenMP 60] Initial parsing/sema for `need_device_addr` modifier on `adjust_args` clause (#143442) Adds initial parsing and semantic analysis for 
`need_device_addr` modifier on `adjust_args` clause. --- clang/include/clang/Basic/Attr.td | 1 + .../clang/Basic/DiagnosticParseKinds.td | 6 ++-- clang/include/clang/Basic/OpenMPKinds.def | 1 + clang/include/clang/Sema/SemaOpenMP.h | 1 + clang/lib/AST/AttrImpl.cpp | 6 ++++ clang/lib/Parse/ParseOpenMP.cpp | 28 +++++++++++++------ clang/lib/Sema/SemaOpenMP.cpp | 5 ++++ .../lib/Sema/SemaTemplateInstantiateDecl.cpp | 11 ++++++-- .../declare_variant_clauses_ast_print.cpp | 26 ++++++++++------- .../declare_variant_clauses_messages.cpp | 24 +++++++++++----- 10 files changed, 80 insertions(+), 29 deletions(-) diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 9e84462eaa660..f113cd2ba2fbf 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -4630,6 +4630,7 @@ def OMPDeclareVariant : InheritableAttr { OMPTraitInfoArgument<"TraitInfos">, VariadicExprArgument<"AdjustArgsNothing">, VariadicExprArgument<"AdjustArgsNeedDevicePtr">, + VariadicExprArgument<"AdjustArgsNeedDeviceAddr">, VariadicOMPInteropInfoArgument<"AppendArgs">, ]; let AdditionalMembers = [{ diff --git a/clang/include/clang/Basic/DiagnosticParseKinds.td b/clang/include/clang/Basic/DiagnosticParseKinds.td index 3aa36ad59d0b9..6c30da376dafb 100644 --- a/clang/include/clang/Basic/DiagnosticParseKinds.td +++ b/clang/include/clang/Basic/DiagnosticParseKinds.td @@ -1581,8 +1581,10 @@ def err_omp_unexpected_append_op : Error< "unexpected operation specified in 'append_args' clause, expected 'interop'">; def err_omp_unexpected_execution_modifier : Error< "unexpected 'execution' modifier in non-executable context">; -def err_omp_unknown_adjust_args_op : Error< - "incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'">; +def err_omp_unknown_adjust_args_op + : Error< + "incorrect 'adjust_args' type, expected 'need_device_ptr'%select{|, " + "'need_device_addr',}0 or 'nothing'">; def err_omp_declare_variant_wrong_clause : Error< "expected 
%select{'match'|'match', 'adjust_args', or 'append_args'}0 clause " "on 'omp declare variant' directive">; diff --git a/clang/include/clang/Basic/OpenMPKinds.def b/clang/include/clang/Basic/OpenMPKinds.def index b0de65df7e397..2b1dc1e0121b2 100644 --- a/clang/include/clang/Basic/OpenMPKinds.def +++ b/clang/include/clang/Basic/OpenMPKinds.def @@ -214,6 +214,7 @@ OPENMP_ORIGINAL_SHARING_MODIFIER(default) // Adjust-op kinds for the 'adjust_args' clause. OPENMP_ADJUST_ARGS_KIND(nothing) OPENMP_ADJUST_ARGS_KIND(need_device_ptr) +OPENMP_ADJUST_ARGS_KIND(need_device_addr) // Binding kinds for the 'bind' clause. OPENMP_BIND_KIND(teams) diff --git a/clang/include/clang/Sema/SemaOpenMP.h b/clang/include/clang/Sema/SemaOpenMP.h index 6498390fe96f7..be6bec2068784 100644 --- a/clang/include/clang/Sema/SemaOpenMP.h +++ b/clang/include/clang/Sema/SemaOpenMP.h @@ -849,6 +849,7 @@ class SemaOpenMP : public SemaBase { FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI, ArrayRef AdjustArgsNothing, ArrayRef AdjustArgsNeedDevicePtr, + ArrayRef AdjustArgsNeedDeviceAddr, ArrayRef AppendArgs, SourceLocation AdjustArgsLoc, SourceLocation AppendArgsLoc, SourceRange SR); diff --git a/clang/lib/AST/AttrImpl.cpp b/clang/lib/AST/AttrImpl.cpp index fefb8f55a9ee2..5875a925d3fb0 100644 --- a/clang/lib/AST/AttrImpl.cpp +++ b/clang/lib/AST/AttrImpl.cpp @@ -224,6 +224,12 @@ void OMPDeclareVariantAttr::printPrettyPragma( PrintExprs(adjustArgsNeedDevicePtr_begin(), adjustArgsNeedDevicePtr_end()); OS << ")"; } + if (adjustArgsNeedDeviceAddr_size()) { + OS << " adjust_args(need_device_addr:"; + PrintExprs(adjustArgsNeedDeviceAddr_begin(), + adjustArgsNeedDeviceAddr_end()); + OS << ")"; + } auto PrintInteropInfo = [&OS](OMPInteropInfo *Begin, OMPInteropInfo *End) { for (OMPInteropInfo *I = Begin; I != End; ++I) { diff --git a/clang/lib/Parse/ParseOpenMP.cpp b/clang/lib/Parse/ParseOpenMP.cpp index e41e5ba8596b9..b69c3abe0b321 100644 --- a/clang/lib/Parse/ParseOpenMP.cpp +++ 
b/clang/lib/Parse/ParseOpenMP.cpp @@ -1483,6 +1483,7 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, OMPTraitInfo &TI = ASTCtx.getNewOMPTraitInfo(); SmallVector AdjustNothing; SmallVector AdjustNeedDevicePtr; + SmallVector AdjustNeedDeviceAddr; SmallVector AppendArgs; SourceLocation AdjustArgsLoc, AppendArgsLoc; @@ -1515,11 +1516,21 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, SmallVector Vars; IsError = ParseOpenMPVarList(OMPD_declare_variant, OMPC_adjust_args, Vars, Data); - if (!IsError) - llvm::append_range(Data.ExtraModifier == OMPC_ADJUST_ARGS_nothing - ? AdjustNothing - : AdjustNeedDevicePtr, - Vars); + if (!IsError) { + switch (Data.ExtraModifier) { + case OMPC_ADJUST_ARGS_nothing: + llvm::append_range(AdjustNothing, Vars); + break; + case OMPC_ADJUST_ARGS_need_device_ptr: + llvm::append_range(AdjustNeedDevicePtr, Vars); + break; + case OMPC_ADJUST_ARGS_need_device_addr: + llvm::append_range(AdjustNeedDeviceAddr, Vars); + break; + default: + llvm_unreachable("Unexpected 'adjust_args' clause modifier."); + } + } break; } case OMPC_append_args: @@ -1559,8 +1570,8 @@ void Parser::ParseOMPDeclareVariantClauses(Parser::DeclGroupPtrTy Ptr, if (DeclVarData && !TI.Sets.empty()) Actions.OpenMP().ActOnOpenMPDeclareVariantDirective( DeclVarData->first, DeclVarData->second, TI, AdjustNothing, - AdjustNeedDevicePtr, AppendArgs, AdjustArgsLoc, AppendArgsLoc, - SourceRange(Loc, Tok.getLocation())); + AdjustNeedDevicePtr, AdjustNeedDeviceAddr, AppendArgs, AdjustArgsLoc, + AppendArgsLoc, SourceRange(Loc, Tok.getLocation())); // Skip the last annot_pragma_openmp_end. 
(void)ConsumeAnnotationToken(); @@ -4818,7 +4829,8 @@ bool Parser::ParseOpenMPVarList(OpenMPDirectiveKind DKind, getLangOpts()); Data.ExtraModifierLoc = Tok.getLocation(); if (Data.ExtraModifier == OMPC_ADJUST_ARGS_unknown) { - Diag(Tok, diag::err_omp_unknown_adjust_args_op); + Diag(Tok, diag::err_omp_unknown_adjust_args_op) + << (getLangOpts().OpenMP >= 60 ? 1 : 0); SkipUntil(tok::r_paren, tok::annot_pragma_openmp_end, StopBeforeMatch); } else { ConsumeToken(); diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 2cbe79c5c07ca..d928b7ae2b4c2 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -7122,6 +7122,7 @@ void SemaOpenMP::ActOnFinishedFunctionDefinitionInOpenMPDeclareVariantScope( getASTContext(), VariantFuncRef, DVScope.TI, /*NothingArgs=*/nullptr, /*NothingArgsSize=*/0, /*NeedDevicePtrArgs=*/nullptr, /*NeedDevicePtrArgsSize=*/0, + /*NeedDeviceAddrArgs=*/nullptr, /*NeedDeviceAddrArgsSize=*/0, /*AppendArgs=*/nullptr, /*AppendArgsSize=*/0); for (FunctionDecl *BaseFD : Bases) BaseFD->addAttr(OMPDeclareVariantA); @@ -7553,6 +7554,7 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective( FunctionDecl *FD, Expr *VariantRef, OMPTraitInfo &TI, ArrayRef AdjustArgsNothing, ArrayRef AdjustArgsNeedDevicePtr, + ArrayRef AdjustArgsNeedDeviceAddr, ArrayRef AppendArgs, SourceLocation AdjustArgsLoc, SourceLocation AppendArgsLoc, SourceRange SR) { @@ -7564,6 +7566,7 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective( SmallVector AllAdjustArgs; llvm::append_range(AllAdjustArgs, AdjustArgsNothing); llvm::append_range(AllAdjustArgs, AdjustArgsNeedDevicePtr); + llvm::append_range(AllAdjustArgs, AdjustArgsNeedDeviceAddr); if (!AllAdjustArgs.empty() || !AppendArgs.empty()) { VariantMatchInfo VMI; @@ -7614,6 +7617,8 @@ void SemaOpenMP::ActOnOpenMPDeclareVariantDirective( const_cast(AdjustArgsNothing.data()), AdjustArgsNothing.size(), const_cast(AdjustArgsNeedDevicePtr.data()), AdjustArgsNeedDevicePtr.size(), + 
const_cast(AdjustArgsNeedDeviceAddr.data()), + AdjustArgsNeedDeviceAddr.size(), const_cast(AppendArgs.data()), AppendArgs.size(), SR); FD->addAttr(NewAttr); } diff --git a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp index 57271415f838c..a25bfd1c48dee 100644 --- a/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiateDecl.cpp @@ -527,6 +527,7 @@ static void instantiateOMPDeclareVariantAttr( SmallVector NothingExprs; SmallVector NeedDevicePtrExprs; + SmallVector NeedDeviceAddrExprs; SmallVector AppendArgs; for (Expr *E : Attr.adjustArgsNothing()) { @@ -541,14 +542,20 @@ static void instantiateOMPDeclareVariantAttr( continue; NeedDevicePtrExprs.push_back(ER.get()); } + for (Expr *E : Attr.adjustArgsNeedDeviceAddr()) { + ExprResult ER = Subst(E); + if (ER.isInvalid()) + continue; + NeedDeviceAddrExprs.push_back(ER.get()); + } for (OMPInteropInfo &II : Attr.appendArgs()) { // When prefer_type is implemented for append_args handle them here too. 
AppendArgs.emplace_back(II.IsTarget, II.IsTargetSync); } S.OpenMP().ActOnOpenMPDeclareVariantDirective( - FD, E, TI, NothingExprs, NeedDevicePtrExprs, AppendArgs, SourceLocation(), - SourceLocation(), Attr.getRange()); + FD, E, TI, NothingExprs, NeedDevicePtrExprs, NeedDeviceAddrExprs, + AppendArgs, SourceLocation(), SourceLocation(), Attr.getRange()); } static void instantiateDependentAMDGPUFlatWorkGroupSizeAttr( diff --git a/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp b/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp index 172dd1670421d..c14e19cc8b7ec 100644 --- a/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp +++ b/clang/test/OpenMP/declare_variant_clauses_ast_print.cpp @@ -54,9 +54,9 @@ void foo_v3(float *AAA, float *BBB, int *I) {return;} //DUMP: DeclRefExpr{{.*}}Function{{.*}}foo_v1 //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA' //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB' -//PRINT: #pragma omp declare variant(foo_v3) match(construct={dispatch}, device={arch(x86, x86_64)}) adjust_args(nothing:I) adjust_args(need_device_ptr:BBB) +//PRINT: #pragma omp declare variant(foo_v3) match(construct={dispatch}, device={arch(x86, x86_64)}) adjust_args(nothing:I) adjust_args(need_device_ptr:BBB) adjust_args(need_device_addr:AAA) -//PRINT: #pragma omp declare variant(foo_v2) match(construct={dispatch}, device={arch(ppc)}) adjust_args(need_device_ptr:AAA) +//PRINT: #pragma omp declare variant(foo_v2) match(construct={dispatch}, device={arch(ppc)}) adjust_args(need_device_ptr:AAA) adjust_args(need_device_addr:BBB) //PRINT: omp declare variant(foo_v1) match(construct={dispatch}, device={arch(arm)}) adjust_args(need_device_ptr:AAA,BBB) @@ -66,42 +66,48 @@ void foo_v3(float *AAA, float *BBB, int *I) {return;} #pragma omp declare variant(foo_v2) \ match(construct={dispatch}, device={arch(ppc)}), \ - adjust_args(need_device_ptr:AAA) + adjust_args(need_device_ptr:AAA) \ + adjust_args(need_device_addr:BBB) #pragma omp declare variant(foo_v3) \ 
adjust_args(need_device_ptr:BBB) adjust_args(nothing:I) \ + adjust_args(need_device_addr:AAA) \ match(construct={dispatch}, device={arch(x86,x86_64)}) void foo(float *AAA, float *BBB, int *I) {return;} -void Foo_Var(float *AAA, float *BBB) {return;} +void Foo_Var(float *AAA, float *BBB, float *CCC) {return;} #pragma omp declare variant(Foo_Var) \ match(construct={dispatch}, device={arch(x86_64)}) \ - adjust_args(need_device_ptr:AAA) adjust_args(nothing:BBB) + adjust_args(need_device_ptr:AAA) adjust_args(nothing:BBB) \ + adjust_args(need_device_addr:CCC) template -void Foo(T *AAA, T *BBB) {return;} +void Foo(T *AAA, T *BBB, T *CCC) {return;} -//PRINT: #pragma omp declare variant(Foo_Var) match(construct={dispatch}, device={arch(x86_64)}) adjust_args(nothing:BBB) adjust_args(need_device_ptr:AAA) -//DUMP: FunctionDecl{{.*}} Foo 'void (T *, T *)' +//PRINT: #pragma omp declare variant(Foo_Var) match(construct={dispatch}, device={arch(x86_64)}) adjust_args(nothing:BBB) adjust_args(need_device_ptr:AAA) adjust_args(need_device_addr:CCC) +//DUMP: FunctionDecl{{.*}} Foo 'void (T *, T *, T *)' //DUMP: OMPDeclareVariantAttr{{.*}}device={arch(x86_64)} //DUMP: DeclRefExpr{{.*}}Function{{.*}}Foo_Var //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB' //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA' +//DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'CCC' // -//DUMP: FunctionDecl{{.*}} Foo 'void (float *, float *)' +//DUMP: FunctionDecl{{.*}} Foo 'void (float *, float *, float *)' //DUMP: OMPDeclareVariantAttr{{.*}}device={arch(x86_64)} //DUMP: DeclRefExpr{{.*}}Function{{.*}}Foo_Var //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'BBB' //DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'AAA' +//DUMP: DeclRefExpr{{.*}}ParmVar{{.*}}'CCC' void func() { float *A; float *B; + float *C; //#pragma omp dispatch - Foo(A, B); + Foo(A, B, C); } typedef void *omp_interop_t; diff --git a/clang/test/OpenMP/declare_variant_clauses_messages.cpp b/clang/test/OpenMP/declare_variant_clauses_messages.cpp index 284e49bbd21b4..aadded7699ea1 100644 
--- a/clang/test/OpenMP/declare_variant_clauses_messages.cpp +++ b/clang/test/OpenMP/declare_variant_clauses_messages.cpp @@ -1,10 +1,10 @@ -// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 -o - %s -// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 \ +// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 -o - %s +// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 \ // RUN: -DNO_INTEROP_T_DEF -o - %s -// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -std=c++11 -o - %s -// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -Wno-strict-prototypes -DC -x c -o - %s +// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -std=c++11 -o - %s +// RUN: %clang_cc1 -verify -triple x86_64-unknown-linux -fopenmp -fopenmp-version=60 -Wno-strict-prototypes -DC -x c -o - %s // RUN: %clang_cc1 -verify -triple x86_64-pc-windows-msvc -fms-compatibility \ -// RUN: -fopenmp -Wno-strict-prototypes -DC -DWIN -x c -o - %s +// RUN: -fopenmp -fopenmp-version=60 -Wno-strict-prototypes -DC -DWIN -x c -o - %s #ifdef NO_INTEROP_T_DEF void foo_v1(float *, void *); @@ -114,6 +114,16 @@ void vararg_bar2(const char *fmt) { return; } match(construct={dispatch}, device={arch(ppc)}), \ adjust_args(need_device_ptr:AAA) adjust_args(nothing:AAA) +// expected-error@+3 {{'adjust_arg' argument 'AAA' used in multiple clauses}} +#pragma omp declare variant(foo_v1) \ + match(construct={dispatch}, device={arch(arm)}) \ + adjust_args(need_device_ptr:AAA,BBB) adjust_args(need_device_addr:AAA) + +// expected-error@+3 {{'adjust_arg' argument 'AAA' used in multiple clauses}} +#pragma omp declare variant(foo_v1) \ + match(construct={dispatch}, device={arch(ppc)}), \ + adjust_args(need_device_addr:AAA) adjust_args(nothing:AAA) + // expected-error@+2 {{use of undeclared identifier 'J'}} #pragma omp declare variant(foo_v1) \ 
adjust_args(nothing:J) \ @@ -186,12 +196,12 @@ void vararg_bar2(const char *fmt) { return; } // expected-error@+1 {{variant in '#pragma omp declare variant' with type 'void (float *, float *, int *, omp_interop_t)' (aka 'void (float *, float *, int *, void *)') is incompatible with type 'void (float *, float *, int *)'}} #pragma omp declare variant(foo_v4) match(construct={dispatch}) -// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}} +// expected-error@+3 {{incorrect 'adjust_args' type, expected 'need_device_ptr', 'need_device_addr', or 'nothing'}} #pragma omp declare variant(foo_v1) \ match(construct={dispatch}, device={arch(arm)}) \ adjust_args(badaaop:AAA,BBB) -// expected-error@+3 {{incorrect adjust_args type, expected 'need_device_ptr' or 'nothing'}} +// expected-error@+3 {{incorrect 'adjust_args' type, expected 'need_device_ptr', 'need_device_addr', or 'nothing'}} #pragma omp declare variant(foo_v1) \ match(construct={dispatch}, device={arch(arm)}) \ adjust_args(badaaop AAA,BBB) From 28bda778437fea17a25b561f1b3b84545612b565 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 11 Jun 2025 22:19:31 -0700 Subject: [PATCH 186/851] Introduce MCAsmInfo::UsesSetToEquateSymbol and prefer = to .set Introduce MCAsmInfo::UsesSetToEquateSymbol to control the preferred syntax for symbol equating. We now favor the more readable and common `symbol = expression` syntax over `.set`. This aligns with pre- https://reviews.llvm.org/D44256 behavior. On Apple platforms, this resolves a clang -S vs -c behavior difference (resolves #104623). For targets whose = support is unconfirmed, UsesSetToEquateSymbol is set to true, so they keep emitting `.set`. This also minimizes test updates. 
Pull Request: https://github.com/llvm/llvm-project/pull/142289 --- clang/test/CodeGen/alias.c | 6 +-- llvm/include/llvm/MC/MCAsmInfo.h | 4 ++ llvm/lib/MC/MCAsmStreamer.cpp | 6 ++- .../AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp | 1 + .../Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp | 1 + .../PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp | 2 + .../SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp | 1 + llvm/test/CodeGen/AArch64/arm64ec-alias.ll | 14 +++--- .../AArch64/arm64ec-hybrid-patchable.ll | 18 +++---- llvm/test/CodeGen/AArch64/arm64ec-symbols.ll | 6 +-- llvm/test/CodeGen/AArch64/arm64ec-varargs.ll | 16 +++--- llvm/test/CodeGen/AArch64/ehcontguard.ll | 2 +- llvm/test/CodeGen/AArch64/global-merge-1.ll | 8 +-- llvm/test/CodeGen/AArch64/global-merge-2.ll | 12 ++--- llvm/test/CodeGen/AArch64/global-merge-3.ll | 10 ++-- .../AArch64/global-merge-hidden-minsize.ll | 4 +- llvm/test/CodeGen/AArch64/ifunc-asm.ll | 2 +- llvm/test/CodeGen/AArch64/seh-finally.ll | 8 +-- .../CodeGen/AArch64/stackguard-internal.ll | 2 +- llvm/test/CodeGen/ARM/alias_store.ll | 2 +- llvm/test/CodeGen/ARM/aliases.ll | 14 +++--- .../CodeGen/ARM/global-merge-dllexport.ll | 4 +- .../CodeGen/ARM/global-merge-external-2.ll | 12 ++--- .../test/CodeGen/ARM/global-merge-external.ll | 12 ++--- llvm/test/CodeGen/AVR/global-aliases.ll | 28 +++++------ llvm/test/CodeGen/Mips/hf16call32_body.ll | 24 ++++----- llvm/test/CodeGen/Mips/mips16ex.ll | 2 +- .../PowerPC/asm-printer-topological-order.ll | 6 +-- llvm/test/CodeGen/PowerPC/data-align.ll | 10 ++-- llvm/test/CodeGen/WebAssembly/aliases.ll | 22 ++++---- llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll | 2 +- llvm/test/CodeGen/WinCFGuard/cfguard.ll | 2 +- .../CodeGen/X86/2007-09-06-ExtWeakAliasee.ll | 2 +- llvm/test/CodeGen/X86/2009-08-12-badswitch.ll | 50 +++++++++---------- .../CodeGen/X86/2010-05-26-DotDebugLoc.ll | 8 +-- llvm/test/CodeGen/X86/alias-gep.ll | 8 +-- llvm/test/CodeGen/X86/aliases.ll | 8 +-- .../CodeGen/X86/catchret-empty-fallthrough.ll | 2 +- 
llvm/test/CodeGen/X86/coff-alias-type.ll | 2 +- llvm/test/CodeGen/X86/coff-comdat.ll | 2 +- llvm/test/CodeGen/X86/coff-feat00.ll | 2 +- llvm/test/CodeGen/X86/dllexport-x86_64.ll | 10 ++-- llvm/test/CodeGen/X86/dllexport.ll | 8 +-- llvm/test/CodeGen/X86/ehcontguard.ll | 2 +- .../CodeGen/X86/fastcall-correct-mangling.ll | 4 +- llvm/test/CodeGen/X86/ifunc-asm.ll | 2 +- .../test/CodeGen/X86/lea-opt-memop-check-1.ll | 6 +-- llvm/test/CodeGen/X86/linux-preemption.ll | 16 +++--- llvm/test/CodeGen/X86/localescape.ll | 16 +++--- llvm/test/CodeGen/X86/pr22019.ll | 8 +-- llvm/test/CodeGen/X86/seh-catch-all-win32.ll | 4 +- llvm/test/CodeGen/X86/seh-catchpad.ll | 2 +- llvm/test/CodeGen/X86/seh-finally.ll | 2 +- llvm/test/CodeGen/X86/seh-no-invokes.ll | 2 +- llvm/test/CodeGen/X86/seh-stack-realign.ll | 4 +- llvm/test/CodeGen/X86/tailcall-cgp-dup.ll | 12 ++--- .../X86/windows-seh-EHa-TryInFinally.ll | 2 +- llvm/test/CodeGen/XCore/globals.ll | 2 +- llvm/test/CodeGen/XCore/linkage.ll | 4 +- llvm/test/DebugInfo/X86/dbg-value-range.ll | 4 +- .../X86/stmt-list-multiple-compile-units.ll | 4 +- llvm/test/MC/AArch64/basic-a64-instructions.s | 2 +- llvm/test/MC/AsmParser/assignment.s | 12 ++--- llvm/test/MC/AsmParser/directive_include.s | 2 +- llvm/test/MC/AsmParser/directive_set.s | 6 +-- llvm/test/MC/AsmParser/include.ll | 4 +- llvm/test/MC/AsmParser/labels.s | 6 +-- llvm/test/MC/AsmParser/macro-arg-darwin.s | 4 +- llvm/test/MC/AsmParser/motorola_integers.s | 16 +++--- llvm/test/MC/Mips/cpsetup.s | 2 +- 70 files changed, 263 insertions(+), 252 deletions(-) diff --git a/clang/test/CodeGen/alias.c b/clang/test/CodeGen/alias.c index bc4167adf53f6..9403c55beae0b 100644 --- a/clang/test/CodeGen/alias.c +++ b/clang/test/CodeGen/alias.c @@ -29,20 +29,20 @@ const int wacom_usb_ids[] = {1, 1, 2, 3, 5, 8, 13, 0}; extern const int __mod_usb_device_table __attribute__ ((alias("wacom_usb_ids"))); // CHECKBASIC-DAG: @__mod_usb_device_table ={{.*}} alias i32, ptr @wacom_usb_ids // CHECKASM-DAG: .globl 
__mod_usb_device_table -// CHECKASM-DAG: .set __mod_usb_device_table, wacom_usb_ids +// CHECKASM-DAG: __mod_usb_device_table = wacom_usb_ids // CHECKASM-NOT: .size __mod_usb_device_table extern int g1; extern int g1 __attribute((alias("g0"))); // CHECKBASIC-DAG: @g1 ={{.*}} alias i32, ptr @g0 // CHECKASM-DAG: .globl g1 -// CHECKASM-DAG: .set g1, g0 +// CHECKASM-DAG: g1 = g0 // CHECKASM-NOT: .size g1 extern __thread int __libc_errno __attribute__ ((alias ("TL_WITH_ALIAS"))); // CHECKBASIC-DAG: @__libc_errno ={{.*}} thread_local alias i32, ptr @TL_WITH_ALIAS // CHECKASM-DAG: .globl __libc_errno -// CHECKASM-DAG: .set __libc_errno, TL_WITH_ALIAS +// CHECKASM-DAG: __libc_errno = TL_WITH_ALIAS // CHECKASM-NOT: .size __libc_errno void f0(void) { } diff --git a/llvm/include/llvm/MC/MCAsmInfo.h b/llvm/include/llvm/MC/MCAsmInfo.h index 4eb50344d6384..e98cd17a9df50 100644 --- a/llvm/include/llvm/MC/MCAsmInfo.h +++ b/llvm/include/llvm/MC/MCAsmInfo.h @@ -141,6 +141,9 @@ class LLVM_ABI MCAsmInfo { /// This is appended to emitted labels. Defaults to ":" const char *LabelSuffix; + /// Use .set instead of = to equate a symbol to an expression. + bool UsesSetToEquateSymbol = false; + // Print the EH begin symbol with an assignment. Defaults to false. 
bool UseAssignmentForEHBegin = false; @@ -525,6 +528,7 @@ class LLVM_ABI MCAsmInfo { bool shouldAllowAdditionalComments() const { return AllowAdditionalComments; } const char *getLabelSuffix() const { return LabelSuffix; } + bool usesSetToEquateSymbol() const { return UsesSetToEquateSymbol; } bool useAssignmentForEHBegin() const { return UseAssignmentForEHBegin; } bool needsLocalForSize() const { return NeedsLocalForSize; } StringRef getPrivateGlobalPrefix() const { return PrivateGlobalPrefix; } diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp index da0d99e70d9ea..4380f74318e7b 100644 --- a/llvm/lib/MC/MCAsmStreamer.cpp +++ b/llvm/lib/MC/MCAsmStreamer.cpp @@ -695,9 +695,11 @@ void MCAsmStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) { if (E->inlineAssignedExpr()) EmitSet = false; if (EmitSet) { - OS << ".set "; + bool UseSet = MAI->usesSetToEquateSymbol(); + if (UseSet) + OS << ".set "; Symbol->print(OS, MAI); - OS << ", "; + OS << (UseSet ? ", " : " = "); Value->print(OS, MAI); EmitEOL(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp index 6f1d89e500ed3..fcf134aa8658f 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp @@ -42,6 +42,7 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT, CommentString = ";"; InlineAsmStart = ";#ASMSTART"; InlineAsmEnd = ";#ASMEND"; + UsesSetToEquateSymbol = true; //===--- Data Emission Directives -------------------------------------===// UsesELFSectionDirectiveForBSS = true; diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp index 7675b05f106a0..ba8faaeb74a07 100644 --- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp +++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp @@ -38,6 +38,7 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const 
Triple &TT) { LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment; InlineAsmStart = "# InlineAsm Start"; InlineAsmEnd = "# InlineAsm End"; + UsesSetToEquateSymbol = true; ZeroDirective = "\t.space\t"; AscizDirective = "\t.string\t"; diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp index 160ee07fad5cc..b5be23c5a96ad 100644 --- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp +++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp @@ -155,5 +155,7 @@ PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) { // Support $ as PC in inline asm DollarIsPC = true; + UsesSetToEquateSymbol = true; + initializeVariantKinds(variantKindDescs); } diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp index 27272cdbbd230..e9d387399bf30 100644 --- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp +++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp @@ -49,6 +49,7 @@ SystemZMCAsmInfoGOFF::SystemZMCAsmInfoGOFF(const Triple &TT) { CalleeSaveStackSlotSize = 8; CodePointerSize = 8; CommentString = "*"; + UsesSetToEquateSymbol = true; ExceptionsType = ExceptionHandling::ZOS; IsHLASM = true; IsLittleEndian = false; diff --git a/llvm/test/CodeGen/AArch64/arm64ec-alias.ll b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll index 03cc873136940..18023a95a5d20 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-alias.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-alias.ll @@ -13,30 +13,30 @@ define dso_local void @patchable_func() hybrid_patchable { @patchable_alias = alias void (), ptr @patchable_func ; CHECK: .weak_anti_dep func_alias -; CHECK-NEXT: .set func_alias, "#func_alias" +; CHECK-NEXT: func_alias = "#func_alias" ; CHECK-NEXT: .weak_anti_dep func_alias2 -; CHECK-NEXT: .set func_alias2, "#func_alias2" +; CHECK-NEXT: func_alias2 = "#func_alias2" ; CHECK-NEXT: .weak_anti_dep func -; CHECK-NEXT: .set func, 
"#func" +; CHECK-NEXT: func = "#func" ; CHECK: .weak_anti_dep patchable_alias -; CHECK-NEXT: .set patchable_alias, "#patchable_alias" +; CHECK-NEXT: patchable_alias = "#patchable_alias" ; CHECK: .globl "#func_alias" ; CHECK-NEXT: .def "#func_alias"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#func_alias", "#func" +; CHECK-NEXT: "#func_alias" = "#func" ; CHECK-NEXT: .globl "#func_alias2" ; CHECK-NEXT: .def "#func_alias2"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#func_alias2", "#func_alias" +; CHECK-NEXT: "#func_alias2" = "#func_alias" ; CHECK: .globl "#patchable_alias" ; CHECK-NEXT: .def "#patchable_alias"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#patchable_alias", "#patchable_func" +; CHECK-NEXT: "#patchable_alias" = "#patchable_func" diff --git a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll index f964484c0c2d4..7c77832a9d9a5 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-hybrid-patchable.ll @@ -76,7 +76,7 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: "#caller": // @"#caller" ; CHECK-NEXT: .weak_anti_dep caller -; CHECK-NEXT: .set caller, "#caller"{{$}} +; CHECK-NEXT: caller = "#caller"{{$}} ; CHECK-NEXT: // %bb.0: ; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill ; CHECK-NEXT: bl "#func" @@ -253,13 +253,13 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef ; CHECK-NEXT: .weak func -; CHECK-NEXT: .set func, "EXP+#func"{{$}} +; CHECK-NEXT: func = "EXP+#func"{{$}} ; CHECK-NEXT: .weak "#func" ; CHECK-NEXT: .def "#func"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#func", "#func$hybpatch_thunk"{{$}} +; CHECK-NEXT: "#func" = "#func$hybpatch_thunk"{{$}} ; CHECK-NEXT: .def "EXP+#has_varargs"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; @@ -269,13 +269,13 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef ; CHECK-NEXT: .weak has_varargs -; CHECK-NEXT: .set has_varargs, "EXP+#has_varargs" +; CHECK-NEXT: has_varargs = "EXP+#has_varargs" ; CHECK-NEXT: .weak "#has_varargs" ; CHECK-NEXT: .def "#has_varargs"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#has_varargs", "#has_varargs$hybpatch_thunk" +; CHECK-NEXT: "#has_varargs" = "#has_varargs$hybpatch_thunk" ; CHECK-NEXT: .def "EXP+#has_sret"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; @@ -285,13 +285,13 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef ; CHECK-NEXT: .weak has_sret -; CHECK-NEXT: .set has_sret, "EXP+#has_sret" +; CHECK-NEXT: has_sret = "EXP+#has_sret" ; CHECK-NEXT: .weak "#has_sret" ; CHECK-NEXT: .def "#has_sret"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#has_sret", "#has_sret$hybpatch_thunk" +; CHECK-NEXT: "#has_sret" = "#has_sret$hybpatch_thunk" ; CHECK-NEXT: .def "EXP+#exp"; ; CHECK-NEXT: .scl 2; ; CHECK-NEXT: .type 32; @@ -301,13 +301,13 @@ define dso_local void @caller() nounwind { ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef ; CHECK-NEXT: .weak exp -; CHECK-NEXT: .set exp, "EXP+#exp" +; CHECK-NEXT: exp = "EXP+#exp" ; CHECK-NEXT: .weak "#exp" ; CHECK-NEXT: .def "#exp"; ; CHECK-NEXT: .scl 
2; ; CHECK-NEXT: .type 32; ; CHECK-NEXT: .endef -; CHECK-NEXT: .set "#exp", "#exp$hybpatch_thunk" +; CHECK-NEXT: "#exp" = "#exp$hybpatch_thunk" ; SYM: [53](sec 15)(fl 0x00)(ty 20)(scl 2) (nx 0) 0x00000000 #func$hybpatch_thunk ; SYM: [58](sec 16)(fl 0x00)(ty 20)(scl 2) (nx 0) 0x00000000 #has_varargs$hybpatch_thunk diff --git a/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll b/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll index b79dd7d61dd60..b44f39ad7b735 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-symbols.ll @@ -10,12 +10,12 @@ define void @caller() nounwind { } ; CHECK: .weak_anti_dep caller -; CHECK-NEXT: .set caller, "#caller"{{$}} +; CHECK-NEXT: caller = "#caller"{{$}} ; CHECK: .weak_anti_dep func -; CHECK-NEXT: .set func, "#func"{{$}} +; CHECK-NEXT: func = "#func"{{$}} ; CHECK-NEXT: .weak_anti_dep "#func" -; CHECK-NEXT: .set "#func", "#func$exit_thunk"{{$}} +; CHECK-NEXT: "#func" = "#func$exit_thunk"{{$}} ; SYM: [ 8](sec 4)(fl 0x00)(ty 20)(scl 2) (nx 0) 0x00000000 #caller ; SYM: [21](sec 7)(fl 0x00)(ty 20)(scl 2) (nx 0) 0x00000000 #func$exit_thunk diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll index 5fab5738078dc..389969bebaea4 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll @@ -45,9 +45,9 @@ define void @varargs_caller() nounwind { ; CHECK-NEXT: stp x9, x8, [sp] ; CHECK-NEXT: str xzr, [sp, #16] ; CHECK-NEXT: .weak_anti_dep varargs_callee -; CHECK-NEXT: .set varargs_callee, "#varargs_callee" +; CHECK-NEXT: varargs_callee = "#varargs_callee" ; CHECK-NEXT: .weak_anti_dep "#varargs_callee" -; CHECK-NEXT: .set "#varargs_callee", varargs_callee +; CHECK-NEXT: "#varargs_callee" = varargs_callee ; CHECK-NEXT: bl "#varargs_callee" ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #48 @@ -86,9 +86,9 @@ define void @varargs_many_argscalleer() nounwind { ; CHECK-NEXT: stp 
x9, x8, [sp] ; CHECK-NEXT: stp q0, q0, [sp, #16] ; CHECK-NEXT: .weak_anti_dep varargs_many_argscallee -; CHECK-NEXT: .set varargs_many_argscallee, "#varargs_many_argscallee" +; CHECK-NEXT: varargs_many_argscallee = "#varargs_many_argscallee" ; CHECK-NEXT: .weak_anti_dep "#varargs_many_argscallee" -; CHECK-NEXT: .set "#varargs_many_argscallee", varargs_many_argscallee +; CHECK-NEXT: "#varargs_many_argscallee" = varargs_many_argscallee ; CHECK-NEXT: bl "#varargs_many_argscallee" ; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload ; CHECK-NEXT: add sp, sp, #64 @@ -116,9 +116,9 @@ define void @varargs_caller_tail() nounwind { ; CHECK-NEXT: stp x9, x8, [sp] ; CHECK-NEXT: str xzr, [sp, #16] ; CHECK-NEXT: .weak_anti_dep varargs_callee -; CHECK-NEXT: .set varargs_callee, "#varargs_callee" +; CHECK-NEXT: varargs_callee = "#varargs_callee" ; CHECK-NEXT: .weak_anti_dep "#varargs_callee" -; CHECK-NEXT: .set "#varargs_callee", varargs_callee +; CHECK-NEXT: "#varargs_callee" = varargs_callee ; CHECK-NEXT: bl "#varargs_callee" ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload ; CHECK-NEXT: add x4, sp, #48 @@ -129,9 +129,9 @@ define void @varargs_caller_tail() nounwind { ; CHECK-NEXT: mov x5, xzr ; CHECK-NEXT: add sp, sp, #48 ; CHECK-NEXT: .weak_anti_dep varargs_callee -; CHECK-NEXT: .set varargs_callee, "#varargs_callee" +; CHECK-NEXT: varargs_callee = "#varargs_callee" ; CHECK-NEXT: .weak_anti_dep "#varargs_callee" -; CHECK-NEXT: .set "#varargs_callee", varargs_callee +; CHECK-NEXT: "#varargs_callee" = varargs_callee ; CHECK-NEXT: b "#varargs_callee" call void (double, ...) @varargs_callee(double 1.0, i32 2, double 3.0, i32 4, double 5.0, <2 x double> ) tail call void (double, ...) 
@varargs_callee(double 1.0, i32 4, i32 3, i32 2) diff --git a/llvm/test/CodeGen/AArch64/ehcontguard.ll b/llvm/test/CodeGen/AArch64/ehcontguard.ll index eecff391d0f8c..cb603a482d228 100644 --- a/llvm/test/CodeGen/AArch64/ehcontguard.ll +++ b/llvm/test/CodeGen/AArch64/ehcontguard.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=aarch64-windows | FileCheck %s ; EHCont Guard is currently only available on Windows -; CHECK: .set "@feat.00", 16384 +; CHECK: "@feat.00" = 16384 ; CHECK: .section .gehcont$y diff --git a/llvm/test/CodeGen/AArch64/global-merge-1.ll b/llvm/test/CodeGen/AArch64/global-merge-1.ll index cc17e344c211a..626310fc4ec25 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-1.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-1.ll @@ -23,9 +23,9 @@ define void @f1(i32 %a1, i32 %a2) { ;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals ;CHECK: .local .L_MergedGlobals ;CHECK: .comm .L_MergedGlobals,8,4 -;CHECK: .set m, .L_MergedGlobals -;CHECK: .set n, .L_MergedGlobals+4 +;CHECK: m = .L_MergedGlobals +;CHECK: n = .L_MergedGlobals+4 ;CHECK-APPLE-IOS: .zerofill __DATA,__bss,__MergedGlobals,8,2 ; @_MergedGlobals -;CHECK-APPLE-IOS-NOT: .set _m, l__MergedGlobals -;CHECK-APPLE-IOS-NOT: .set _n, l__MergedGlobals+4 +;CHECK-APPLE-IOS-NOT: _m = l__MergedGlobals +;CHECK-APPLE-IOS-NOT: _n = l__MergedGlobals+4 diff --git a/llvm/test/CodeGen/AArch64/global-merge-2.ll b/llvm/test/CodeGen/AArch64/global-merge-2.ll index 85d814c3177b3..1b5333b907d27 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-2.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-2.ll @@ -32,21 +32,21 @@ define dso_local void @g1(i32 %a1, i32 %a2) { ;CHECK: .comm .L_MergedGlobals,12,4 ;CHECK: .globl x -;CHECK: .set x, .L_MergedGlobals +;CHECK: x = .L_MergedGlobals ;CHECK: .size x, 4 ;CHECK: .globl y -;CHECK: .set y, .L_MergedGlobals+4 +;CHECK: y = .L_MergedGlobals+4 ;CHECK: .size y, 4 ;CHECK: .globl z -;CHECK: .set z, .L_MergedGlobals+8 +;CHECK: z = .L_MergedGlobals+8 ;CHECK: .size z, 4 ;CHECK-APPLE-IOS: 
.zerofill __DATA,__common,__MergedGlobals_x,12,2 ;CHECK-APPLE-IOS: .globl _x -;CHECK-APPLE-IOS: .set {{.*}}, __MergedGlobals_x +;CHECK-APPLE-IOS: {{.*}} = __MergedGlobals_x ;CHECK-APPLE-IOS: .globl _y -;CHECK-APPLE-IOS: .set _y, __MergedGlobals_x+4 +;CHECK-APPLE-IOS: _y = __MergedGlobals_x+4 ;CHECK-APPLE-IOS: .globl _z -;CHECK-APPLE-IOS: .set _z, __MergedGlobals_x+8 +;CHECK-APPLE-IOS: _z = __MergedGlobals_x+8 ;CHECK-APPLE-IOS: .subsections_via_symbols diff --git a/llvm/test/CodeGen/AArch64/global-merge-3.ll b/llvm/test/CodeGen/AArch64/global-merge-3.ll index b3f58887139f7..2a0ae12274556 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-3.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-3.ll @@ -40,14 +40,14 @@ define dso_local void @f1(i32 %a1, i32 %a2, i32 %a3) { ;CHECK-APPLE-IOS: .globl __MergedGlobals_x ;CHECK-APPLE-IOS: .zerofill __DATA,__common,__MergedGlobals_x,800,2 -;CHECK-APPLE-IOS: .set _x, __MergedGlobals_x -;CHECK-APPLE-IOS: .set _y, __MergedGlobals_x+400 +;CHECK-APPLE-IOS: _x = __MergedGlobals_x +;CHECK-APPLE-IOS: _y = __MergedGlobals_x+400 ;CHECK: .type .L_MergedGlobals,@object // @_MergedGlobals ;CHECK: .local .L_MergedGlobals ;CHECK: .comm .L_MergedGlobals,800,4 ;CHECK: globl x -;CHECK: .set x, .L_MergedGlobals +;CHECK: x = .L_MergedGlobals ;CHECK: globl y -;CHECK: .set y, .L_MergedGlobals+400 -;CHECK-NOT: .set z, .L_MergedGlobals +;CHECK: y = .L_MergedGlobals+400 +;CHECK-NOT: z = .L_MergedGlobals diff --git a/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll b/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll index 9c694fc4d289c..5292aa91fc381 100644 --- a/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll +++ b/llvm/test/CodeGen/AArch64/global-merge-hidden-minsize.ll @@ -16,10 +16,10 @@ attributes #0 = { minsize optsize } ; CHECK: .globl x ; CHECK: .hidden x -; CHECK: .set x, .L_MergedGlobals +; CHECK: x = .L_MergedGlobals ; CHECK: .size x, 4 ; CHECK: .globl y ; CHECK: .hidden y -; CHECK: .set y, .L_MergedGlobals+4 +; CHECK: y 
= .L_MergedGlobals+4 ; CHECK: .size y, 4 diff --git a/llvm/test/CodeGen/AArch64/ifunc-asm.ll b/llvm/test/CodeGen/AArch64/ifunc-asm.ll index 57fc2f0c9d7f5..7aad6cce09cf2 100644 --- a/llvm/test/CodeGen/AArch64/ifunc-asm.ll +++ b/llvm/test/CodeGen/AArch64/ifunc-asm.ll @@ -16,7 +16,7 @@ entry: @global_ifunc = ifunc i32 (i32), ptr @the_resolver ; ELF: .globl global_ifunc ; ELF-NEXT: .type global_ifunc,@gnu_indirect_function -; ELF-NEXT: .set global_ifunc, the_resolver +; ELF-NEXT: global_ifunc = the_resolver ; MACHO: .section __DATA,__data ; MACHO-NEXT: .p2align 3, 0x0 diff --git a/llvm/test/CodeGen/AArch64/seh-finally.ll b/llvm/test/CodeGen/AArch64/seh-finally.ll index 04a30800d9294..fd6b3fd0bc1fc 100644 --- a/llvm/test/CodeGen/AArch64/seh-finally.ll +++ b/llvm/test/CodeGen/AArch64/seh-finally.ll @@ -38,7 +38,7 @@ entry: ; CHECK: add x29, sp, #16 ; CHECK: mov x0, #-2 ; CHECK: stur x0, [x29, #16] -; CHECK: .set .Lsimple_seh$frame_escape_0, -8 +; CHECK: .Lsimple_seh$frame_escape_0 = -8 ; CHECK: ldur w0, [x29, #-8] ; CHECK: bl foo @@ -89,7 +89,7 @@ entry: ; CHECK: mov x19, sp ; CHECK: mov x0, #-2 ; CHECK: stur x0, [x29, #24] -; CHECK: .set .Lstack_realign$frame_escape_0, 0 +; CHECK: .Lstack_realign$frame_escape_0 = 0 ; CHECK: ldr w0, [x19] ; CHECK: bl foo @@ -137,7 +137,7 @@ entry: ; CHECK: add x29, sp, #32 ; CHECK: mov x1, #-2 ; CHECK: stur x1, [x29, #16] -; CHECK: .set .Lvla_present$frame_escape_0, -4 +; CHECK: .Lvla_present$frame_escape_0 = -4 ; CHECK: stur w0, [x29, #-4] ; CHECK: ldur w8, [x29, #-4] ; CHECK: mov x9, sp @@ -204,7 +204,7 @@ entry: ; CHECK: mov x19, sp ; CHECK: mov x1, #-2 ; CHECK: stur x1, [x29, #24] -; CHECK: .set .Lvla_and_realign$frame_escape_0, 32 +; CHECK: .Lvla_and_realign$frame_escape_0 = 32 ; CHECK: str w0, [x29, #36] ; CHECK: ldr w8, [x29, #36] ; CHECK: mov x9, sp diff --git a/llvm/test/CodeGen/AArch64/stackguard-internal.ll b/llvm/test/CodeGen/AArch64/stackguard-internal.ll index a70c8874edbac..7b32e8c0caab5 100644 --- 
a/llvm/test/CodeGen/AArch64/stackguard-internal.ll +++ b/llvm/test/CodeGen/AArch64/stackguard-internal.ll @@ -6,7 +6,7 @@ target triple = "aarch64-linux-gnu" ; is an alias. (The alias is created by GlobalMerge.) ; CHECK: adrp {{.*}}, __stack_chk_guard ; CHECK: ldr {{.*}}, [{{.*}}, :lo12:__stack_chk_guard] -; CHECK: .set __stack_chk_guard, .L_MergedGlobals+4 +; CHECK: __stack_chk_guard = .L_MergedGlobals+4 @__stack_chk_guard = internal global [8 x i32] zeroinitializer, align 4 @x = internal global i32 0, align 4 diff --git a/llvm/test/CodeGen/ARM/alias_store.ll b/llvm/test/CodeGen/ARM/alias_store.ll index c6612334eaf1b..60aa58d37499c 100644 --- a/llvm/test/CodeGen/ARM/alias_store.ll +++ b/llvm/test/CodeGen/ARM/alias_store.ll @@ -13,4 +13,4 @@ entry: ; CHECK: ldr r{{.*}}, [[L:.*]] ; CHECK: [[L]]: ; CHECK-NEXT: .long XA -; CHECK: .set XA, X+1 +; CHECK: XA = X+1 diff --git a/llvm/test/CodeGen/ARM/aliases.ll b/llvm/test/CodeGen/ARM/aliases.ll index 6075ad813e990..8d9f938155d15 100644 --- a/llvm/test/CodeGen/ARM/aliases.ll +++ b/llvm/test/CodeGen/ARM/aliases.ll @@ -6,30 +6,30 @@ ; CHECK: .size .Lstructvar, 8 ; CHECK: .globl foo1 -; CHECK: .set foo1, bar +; CHECK: foo1 = bar ; CHECK-NOT: .size foo1 ; CHECK: .globl foo2 -; CHECK: .set foo2, bar +; CHECK: foo2 = bar ; CHECK-NOT: .size foo2 ; CHECK: .weak bar_f -; CHECK: .set bar_f, foo_f +; CHECK: bar_f = foo_f ; CHECK-NOT: .size bar_f -; CHECK: .set bar_i, bar +; CHECK: bar_i = bar ; CHECK-NOT: .size bar_i ; CHECK: .globl A -; CHECK: .set A, bar +; CHECK: A = bar ; CHECK-NOT: .size A ; CHECK: .globl elem0 -; CHECK: .set elem0, .Lstructvar +; CHECK: elem0 = .Lstructvar ; CHECK: .size elem0, 4 ; CHECK: .globl elem1 -; CHECK: .set elem1, .Lstructvar+4 +; CHECK: elem1 = .Lstructvar+4 ; CHECK: .size elem1, 4 @bar = global i32 42 diff --git a/llvm/test/CodeGen/ARM/global-merge-dllexport.ll b/llvm/test/CodeGen/ARM/global-merge-dllexport.ll index 89e8a859b9393..f5961d7f79e3d 100644 --- 
a/llvm/test/CodeGen/ARM/global-merge-dllexport.ll +++ b/llvm/test/CodeGen/ARM/global-merge-dllexport.ll @@ -16,6 +16,6 @@ define void @f1(i32 %a1, i32 %a2) { ; CHECK: .section .drectve,"yni" ; CHECK: .ascii " /EXPORT:y,DATA" ; CHECK: .globl x -; CHECK: .set x, .L_MergedGlobals +; CHECK: x = .L_MergedGlobals ; CHECK: .globl y -; CHECK: .set y, .L_MergedGlobals+4 +; CHECK: y = .L_MergedGlobals+4 diff --git a/llvm/test/CodeGen/ARM/global-merge-external-2.ll b/llvm/test/CodeGen/ARM/global-merge-external-2.ll index 602533e045e0b..c9e92d98e4841 100644 --- a/llvm/test/CodeGen/ARM/global-merge-external-2.ll +++ b/llvm/test/CodeGen/ARM/global-merge-external-2.ll @@ -50,16 +50,16 @@ define dso_local void @g1(i32 %a1, i32 %a2) { ;CHECK-WIN32: .lcomm .L_MergedGlobals,8,4 ;CHECK-MERGE: .globl x -;CHECK-MERGE: .set x, .L_MergedGlobals +;CHECK-MERGE: x = .L_MergedGlobals ;CHECK-MERGE: .size x, 4 ;CHECK-MERGE: .globl y -;CHECK-MERGE: .set y, .L_MergedGlobals+4 +;CHECK-MERGE: y = .L_MergedGlobals+4 ;CHECK-MERGE: .size y, 4 -;CHECK-MERGE-NOT: .set z, .L_MergedGlobals+8 +;CHECK-MERGE-NOT: z = .L_MergedGlobals+8 ;CHECK-WIN32: .globl x -;CHECK-WIN32: .set x, .L_MergedGlobals +;CHECK-WIN32: x = .L_MergedGlobals ;CHECK-WIN32: .globl y -;CHECK-WIN32: .set y, .L_MergedGlobals+4 -;CHECK-WIN32-NOT: .set z, .L_MergedGlobals+8 +;CHECK-WIN32: y = .L_MergedGlobals+4 +;CHECK-WIN32-NOT: z = .L_MergedGlobals+8 diff --git a/llvm/test/CodeGen/ARM/global-merge-external.ll b/llvm/test/CodeGen/ARM/global-merge-external.ll index 364659b36bb9a..4fe1914aae351 100644 --- a/llvm/test/CodeGen/ARM/global-merge-external.ll +++ b/llvm/test/CodeGen/ARM/global-merge-external.ll @@ -45,18 +45,18 @@ define dso_local void @g1(i32 %a1, i32 %a2) { ;CHECK-WIN32: .lcomm .L_MergedGlobals,12,4 ;CHECK-MERGE: .globl x -;CHECK-MERGE: .set x, .L_MergedGlobals +;CHECK-MERGE: x = .L_MergedGlobals ;CHECK-MERGE: .size x, 4 ;CHECK-MERGE: .globl y -;CHECK-MERGE: .set y, .L_MergedGlobals+4 +;CHECK-MERGE: y = .L_MergedGlobals+4 
;CHECK-MERGE: .size y, 4 ;CHECK-MERGE: .globl z -;CHECK-MERGE: .set z, .L_MergedGlobals+8 +;CHECK-MERGE: z = .L_MergedGlobals+8 ;CHECK-MERGE: .size z, 4 ;CHECK-WIN32: .globl x -;CHECK-WIN32: .set x, .L_MergedGlobals +;CHECK-WIN32: x = .L_MergedGlobals ;CHECK-WIN32: .globl y -;CHECK-WIN32: .set y, .L_MergedGlobals+4 +;CHECK-WIN32: y = .L_MergedGlobals+4 ;CHECK-WIN32: .globl z -;CHECK-WIN32: .set z, .L_MergedGlobals+8 +;CHECK-WIN32: z = .L_MergedGlobals+8 diff --git a/llvm/test/CodeGen/AVR/global-aliases.ll b/llvm/test/CodeGen/AVR/global-aliases.ll index 91bcedc7e0dba..b948003e8b88d 100644 --- a/llvm/test/CodeGen/AVR/global-aliases.ll +++ b/llvm/test/CodeGen/AVR/global-aliases.ll @@ -1,18 +1,18 @@ ; RUN: llc < %s -mtriple=avr -mcpu=atxmega384c3 | FileCheck %s --check-prefixes=MEGA ; RUN: llc < %s -mtriple=avr -mcpu=attiny40 | FileCheck %s --check-prefixes=TINY -; MEGA: .set __tmp_reg__, 0 -; MEGA: .set __zero_reg__, 1 -; MEGA: .set __SREG__, 63 -; MEGA: .set __SP_H__, 62 -; MEGA: .set __SP_L__, 61 -; MEGA: .set __EIND__, 60 -; MEGA: .set __RAMPZ__, 59 +; MEGA: __tmp_reg__ = 0 +; MEGA: __zero_reg__ = 1 +; MEGA: __SREG__ = 63 +; MEGA: __SP_H__ = 62 +; MEGA: __SP_L__ = 61 +; MEGA: __EIND__ = 60 +; MEGA: __RAMPZ__ = 59 -; TINY: .set __tmp_reg__, 16 -; TINY: .set __zero_reg__, 17 -; TINY: .set __SREG__, 63 -; TINY-NOT: .set __SP_H__, 62 -; TINY: .set __SP_L__, 61 -; TINY-NOT: .set __EIND__, 60 -; TINY-NOT: .set __RAMPZ__, 59 +; TINY: __tmp_reg__ = 16 +; TINY: __zero_reg__ = 17 +; TINY: __SREG__ = 63 +; TINY-NOT: __SP_H__ = 62 +; TINY: __SP_L__ = 61 +; TINY-NOT: __EIND__ = 60 +; TINY-NOT: __RAMPZ__ = 59 diff --git a/llvm/test/CodeGen/Mips/hf16call32_body.ll b/llvm/test/CodeGen/Mips/hf16call32_body.ll index ea83f776bd40f..3bcb6f6bc0152 100644 --- a/llvm/test/CodeGen/Mips/hf16call32_body.ll +++ b/llvm/test/CodeGen/Mips/hf16call32_body.ll @@ -24,7 +24,7 @@ entry: ; stel: addiu $25, $25, %lo(v_sf) ; stel: mfc1 $4, $f12 ; stel: jr $25 -; stel: .set $__fn_local_v_sf, v_sf +; 
stel: $__fn_local_v_sf = v_sf ; stel: .end __fn_stub_v_sf declare i32 @printf(ptr, ...) #1 @@ -46,7 +46,7 @@ entry: ; stel: mfc1 $4, $f12 ; stel: mfc1 $5, $f13 ; stel: jr $25 -; stel: .set $__fn_local_v_df, v_df +; stel: $__fn_local_v_df = v_df ; stel: .end __fn_stub_v_df ; Function Attrs: nounwind @@ -70,7 +70,7 @@ entry: ; stel: mfc1 $4, $f12 ; stel: mfc1 $5, $f14 ; stel: jr $25 -; stel: .set $__fn_local_v_sf_sf, v_sf_sf +; stel: $__fn_local_v_sf_sf = v_sf_sf ; stel: .end __fn_stub_v_sf_sf ; Function Attrs: nounwind @@ -95,7 +95,7 @@ entry: ; stel: mfc1 $6, $f14 ; stel: mfc1 $7, $f15 ; stel: jr $25 -; stel: .set $__fn_local_v_sf_df, v_sf_df +; stel: $__fn_local_v_sf_df = v_sf_df ; stel: .end __fn_stub_v_sf_df ; Function Attrs: nounwind @@ -120,7 +120,7 @@ entry: ; stel: mfc1 $5, $f13 ; stel: mfc1 $6, $f14 ; stel: jr $25 -; stel: .set $__fn_local_v_df_sf, v_df_sf +; stel: $__fn_local_v_df_sf = v_df_sf ; stel: .end __fn_stub_v_df_sf ; Function Attrs: nounwind @@ -146,7 +146,7 @@ entry: ; stel: mfc1 $6, $f14 ; stel: mfc1 $7, $f15 ; stel: jr $25 -; stel: .set $__fn_local_v_df_df, v_df_df +; stel: $__fn_local_v_df_df = v_df_df ; stel: .end __fn_stub_v_df_df ; Function Attrs: nounwind @@ -174,7 +174,7 @@ entry: ; stel: addiu $25, $25, %lo(sf_sf) ; stel: mfc1 $4, $f12 ; stel: jr $25 -; stel: .set $__fn_local_sf_sf, sf_sf +; stel: $__fn_local_sf_sf = sf_sf ; stel: .end __fn_stub_sf_sf @@ -196,7 +196,7 @@ entry: ; stel: mfc1 $4, $f12 ; stel: mfc1 $5, $f13 ; stel: jr $25 -; stel: .set $__fn_local_sf_df, sf_df +; stel: $__fn_local_sf_df = sf_df ; stel: .end __fn_stub_sf_df ; Function Attrs: nounwind @@ -221,7 +221,7 @@ entry: ; stel: mfc1 $4, $f12 ; stel: mfc1 $5, $f14 ; stel: jr $25 -; stel: .set $__fn_local_sf_sf_sf, sf_sf_sf +; stel: $__fn_local_sf_sf_sf = sf_sf_sf ; stel: .end __fn_stub_sf_sf_sf ; Function Attrs: nounwind @@ -247,7 +247,7 @@ entry: ; stel: mfc1 $6, $f14 ; stel: mfc1 $7, $f15 ; stel: jr $25 -; stel: .set $__fn_local_sf_sf_df, sf_sf_df +; stel: 
$__fn_local_sf_sf_df = sf_sf_df ; stel: .end __fn_stub_sf_sf_df ; Function Attrs: nounwind @@ -273,7 +273,7 @@ entry: ; stel: mfc1 $5, $f13 ; stel: mfc1 $6, $f14 ; stel: jr $25 -; stel: .set $__fn_local_sf_df_sf, sf_df_sf +; stel: $__fn_local_sf_df_sf = sf_df_sf ; stel: .end __fn_stub_sf_df_sf ; Function Attrs: nounwind @@ -300,7 +300,7 @@ entry: ; stel: mfc1 $6, $f14 ; stel: mfc1 $7, $f15 ; stel: jr $25 -; stel: .set $__fn_local_sf_df_df, sf_df_df +; stel: $__fn_local_sf_df_df = sf_df_df ; stel: .end __fn_stub_sf_df_df attributes #0 = { nounwind "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" } diff --git a/llvm/test/CodeGen/Mips/mips16ex.ll b/llvm/test/CodeGen/Mips/mips16ex.ll index fb9a44e767516..f4d1125718a9a 100644 --- a/llvm/test/CodeGen/Mips/mips16ex.ll +++ b/llvm/test/CodeGen/Mips/mips16ex.ll @@ -2,7 +2,7 @@ ;16: main: ;16-NEXT: [[TMP:.*]]: -;16-NEXT: .set $func_begin0, [[TMP]] +;16-NEXT: $func_begin0 = [[TMP]] ;16-NEXT: .cfi_startproc ;16-NEXT: .cfi_personality @.str = private unnamed_addr constant [7 x i8] c"hello\0A\00", align 1 diff --git a/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll b/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll index 6299b4e393d9e..3218c77f08c80 100644 --- a/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll +++ b/llvm/test/CodeGen/PowerPC/asm-printer-topological-order.ll @@ -10,6 +10,6 @@ entry: } ; CHECK-LABEL: TestD: -; CHECK: .set TestC, TestD -; CHECK-DAG: .set TestB, TestC -; CHECK-DAG: .set TestA, TestC +; CHECK: TestC = TestD +; CHECK-DAG: TestB = TestC +; CHECK-DAG: TestA = TestC diff --git a/llvm/test/CodeGen/PowerPC/data-align.ll b/llvm/test/CodeGen/PowerPC/data-align.ll index bfedec139369c..42dee13d152a9 100644 --- a/llvm/test/CodeGen/PowerPC/data-align.ll +++ b/llvm/test/CodeGen/PowerPC/data-align.ll @@ -2,23 +2,23 @@ ; RUN: llc < %s -mtriple=powerpc64-unknown-linux | FileCheck %s ; 
RUN: llc < %s -mtriple=powerpc64le-unknown-linux | FileCheck %s -; CHECK: .set .Li8, +; CHECK: .Li8 = ; CHECK-NEXT: .size .Li8, 1 @i8 = private constant i8 42 -; CHECK: .set .Li16, +; CHECK: .Li16 = ; CHECK-NEXT: .size .Li16, 2 @i16 = private constant i16 42 -; CHECK: .set .Li32, +; CHECK: .Li32 = ; CHECK-NEXT: .size .Li32, 4 @i32 = private constant i32 42 -; CHECK: .set .Li64, +; CHECK: .Li64 = ; CHECK-NEXT: .size .Li64, 8 @i64 = private constant i64 42 -; CHECK: .set .Li128, +; CHECK: .Li128 = ; CHECK-NEXT: .size .Li128, 16 @i128 = private constant i128 42 diff --git a/llvm/test/CodeGen/WebAssembly/aliases.ll b/llvm/test/CodeGen/WebAssembly/aliases.ll index 91b57b90df1d6..87b292f53c625 100644 --- a/llvm/test/CodeGen/WebAssembly/aliases.ll +++ b/llvm/test/CodeGen/WebAssembly/aliases.ll @@ -4,11 +4,11 @@ @bar = global i32 42 ; CHECK-DAG: .globl foo1 -; CHECK-DAG: .set foo1, bar +; CHECK-DAG: foo1 = bar @foo1 = alias i32, ptr @bar ; CHECK-DAG: .globl foo2 -; CHECK-DAG: .set foo2, bar +; CHECK-DAG: foo2 = bar @foo2 = alias i32, ptr @bar %FunTy = type i32() @@ -19,14 +19,14 @@ define i32 @foo_f() { ; CHECK-DAG: .weak bar_f ; CHECK-DAG: .type bar_f,@function -; CHECK-DAG: .set bar_f, foo_f +; CHECK-DAG: bar_f = foo_f @bar_f = weak alias %FunTy, ptr @foo_f ; CHECK-DAG: .weak bar_l -; CHECK-DAG: .set bar_l, bar +; CHECK-DAG: bar_l = bar @bar_l = linkonce_odr alias i32, ptr @bar -; CHECK-DAG: .set bar_i, bar +; CHECK-DAG: bar_i = bar @bar_i = internal alias i32, ptr @bar ; CHECK-DAG: .globl A @@ -34,24 +34,24 @@ define i32 @foo_f() { ; CHECK-DAG: .globl bar_h ; CHECK-DAG: .hidden bar_h -; CHECK-DAG: .set bar_h, bar +; CHECK-DAG: bar_h = bar @bar_h = hidden alias i32, ptr @bar ; CHECK-DAG: .globl bar_p ; CHECK-DAG: .protected bar_p -; CHECK-DAG: .set bar_p, bar +; CHECK-DAG: bar_p = bar @bar_p = protected alias i32, ptr @bar -; CHECK-DAG: .set test2, bar+4 +; CHECK-DAG: test2 = bar+4 @test2 = alias i32, getelementptr(i32, ptr @bar, i32 1) -; CHECK-DAG: .set test3, 42 +; 
CHECK-DAG: test3 = 42 @test3 = alias i32, inttoptr(i32 42 to ptr) -; CHECK-DAG: .set test4, bar +; CHECK-DAG: test4 = bar @test4 = alias i32, inttoptr(i64 ptrtoint (ptr @bar to i64) to ptr) -; CHECK-DAG: .set test5, test2-bar +; CHECK-DAG: test5 = test2-bar @test5 = alias i32, inttoptr(i32 sub (i32 ptrtoint (ptr @test2 to i32), i32 ptrtoint (ptr @bar to i32)) to ptr) diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll index 7a5baa09f95e9..10985de88bf2e 100644 --- a/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll +++ b/llvm/test/CodeGen/WinCFGuard/cfguard-mingw.ll @@ -35,7 +35,7 @@ ; } ;------------------------------------------------------------------------------- -; CHECK: .set @feat.00, 2048 +; CHECK: @feat.00 = 2048 ; CHECK: .section .gfids$y ; CHECK: .symidx _ZNK7Derived4calcEv diff --git a/llvm/test/CodeGen/WinCFGuard/cfguard.ll b/llvm/test/CodeGen/WinCFGuard/cfguard.ll index 2ec2e573f7164..a77d5490ef876 100644 --- a/llvm/test/CodeGen/WinCFGuard/cfguard.ll +++ b/llvm/test/CodeGen/WinCFGuard/cfguard.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s ; Control Flow Guard is currently only available on Windows -; CHECK: .set @feat.00, 2048 +; CHECK: @feat.00 = 2048 ; CHECK: .section .gfids$y ; CHECK: .symidx "?address_taken@@YAXXZ" diff --git a/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll b/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll index d59953fb4e37d..cc80f87fda311 100644 --- a/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll +++ b/llvm/test/CodeGen/X86/2007-09-06-ExtWeakAliasee.ll @@ -10,4 +10,4 @@ define weak i32 @pthread_once(ptr, ptr) { ; CHECK: pthread_once: ; CHECK: .weak __gthrw_pthread_once -; CHECK: .set __gthrw_pthread_once, pthread_once +; CHECK: __gthrw_pthread_once = pthread_once diff --git a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll index 7050889d71029..527684f5a27db 100644 --- 
a/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll +++ b/llvm/test/CodeGen/X86/2009-08-12-badswitch.ll @@ -125,31 +125,31 @@ define internal fastcc i32 @foo(i64 %bar) nounwind ssp { ; CHECK-NEXT: retq ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .data_region jt32 -; CHECK-NEXT: .set L0_0_set_3, LBB0_3-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_4, LBB0_4-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_5, LBB0_5-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_6, LBB0_6-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_7, LBB0_7-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_8, LBB0_8-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_9, LBB0_9-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_10, LBB0_10-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_11, LBB0_11-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_12, LBB0_12-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_13, LBB0_13-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_14, LBB0_14-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_15, LBB0_15-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_16, LBB0_16-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_17, LBB0_17-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_18, LBB0_18-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_19, LBB0_19-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_20, LBB0_20-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_21, LBB0_21-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_22, LBB0_22-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_23, LBB0_23-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_24, LBB0_24-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_25, LBB0_25-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_26, LBB0_26-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_27, LBB0_27-LJTI0_0 +; CHECK-NEXT: L0_0_set_3 = LBB0_3-LJTI0_0 +; CHECK-NEXT: L0_0_set_4 = LBB0_4-LJTI0_0 +; CHECK-NEXT: L0_0_set_5 = LBB0_5-LJTI0_0 +; CHECK-NEXT: L0_0_set_6 = LBB0_6-LJTI0_0 +; CHECK-NEXT: L0_0_set_7 = LBB0_7-LJTI0_0 +; CHECK-NEXT: L0_0_set_8 = LBB0_8-LJTI0_0 +; CHECK-NEXT: L0_0_set_9 = LBB0_9-LJTI0_0 +; CHECK-NEXT: L0_0_set_10 = LBB0_10-LJTI0_0 +; CHECK-NEXT: L0_0_set_11 = LBB0_11-LJTI0_0 +; CHECK-NEXT: L0_0_set_12 = LBB0_12-LJTI0_0 +; CHECK-NEXT: L0_0_set_13 = LBB0_13-LJTI0_0 +; CHECK-NEXT: L0_0_set_14 = LBB0_14-LJTI0_0 +; 
CHECK-NEXT: L0_0_set_15 = LBB0_15-LJTI0_0 +; CHECK-NEXT: L0_0_set_16 = LBB0_16-LJTI0_0 +; CHECK-NEXT: L0_0_set_17 = LBB0_17-LJTI0_0 +; CHECK-NEXT: L0_0_set_18 = LBB0_18-LJTI0_0 +; CHECK-NEXT: L0_0_set_19 = LBB0_19-LJTI0_0 +; CHECK-NEXT: L0_0_set_20 = LBB0_20-LJTI0_0 +; CHECK-NEXT: L0_0_set_21 = LBB0_21-LJTI0_0 +; CHECK-NEXT: L0_0_set_22 = LBB0_22-LJTI0_0 +; CHECK-NEXT: L0_0_set_23 = LBB0_23-LJTI0_0 +; CHECK-NEXT: L0_0_set_24 = LBB0_24-LJTI0_0 +; CHECK-NEXT: L0_0_set_25 = LBB0_25-LJTI0_0 +; CHECK-NEXT: L0_0_set_26 = LBB0_26-LJTI0_0 +; CHECK-NEXT: L0_0_set_27 = LBB0_27-LJTI0_0 ; CHECK-NEXT: LJTI0_0: ; CHECK-NEXT: .long L0_0_set_3 ; CHECK-NEXT: .long L0_0_set_3 diff --git a/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll index cf20cfaced5d0..17df3e10fd3d9 100644 --- a/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll +++ b/llvm/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll @@ -64,15 +64,15 @@ attributes #1 = { nounwind readnone } ; CHECK-NEXT: [[CLOBBER:Ltmp[0-9]*]] ; CHECK: Ldebug_loc0: -; CHECK-NEXT: .set [[SET1:.*]], Lfunc_begin0-Lfunc_begin0 +; CHECK-NEXT: [[SET1:.*]] = Lfunc_begin0-Lfunc_begin0 ; CHECK-NEXT: .quad [[SET1]] -; CHECK-NEXT: .set [[SET2:.*]], [[LABEL]]-Lfunc_begin0 +; CHECK-NEXT: [[SET2:.*]] = [[LABEL]]-Lfunc_begin0 ; CHECK-NEXT: .quad [[SET2]] ; CHECK-NEXT: .short 1 ## Loc expr size ; CHECK-NEXT: .byte 85 -; CHECK-NEXT: .set [[SET3:.*]], [[LABEL]]-Lfunc_begin0 +; CHECK-NEXT: [[SET3:.*]] = [[LABEL]]-Lfunc_begin0 ; CHECK-NEXT: .quad [[SET3]] -; CHECK-NEXT: .set [[SET4:.*]], [[CLOBBER]]-Lfunc_begin0 +; CHECK-NEXT: [[SET4:.*]] = [[CLOBBER]]-Lfunc_begin0 ; CHECK-NEXT: .quad [[SET4]] ; CHECK-NEXT: .short 1 ## Loc expr size ; CHECK-NEXT: .byte 83 diff --git a/llvm/test/CodeGen/X86/alias-gep.ll b/llvm/test/CodeGen/X86/alias-gep.ll index 904a611f61d1c..65d2ced6df5ba 100644 --- a/llvm/test/CodeGen/X86/alias-gep.ll +++ b/llvm/test/CodeGen/X86/alias-gep.ll @@ -3,17 +3,17 @@ ;MACHO: .globl _offsetSym0 ;MACHO-NOT: 
.alt_entry -;MACHO: .set _offsetSym0, _s +;MACHO: _offsetSym0 = _s ;MACHO: .globl _offsetSym1 ;MACHO: .alt_entry _offsetSym1 -;MACHO: .set _offsetSym1, _s+8 +;MACHO: _offsetSym1 = _s+8 ;ELF: .globl offsetSym0 ;ELF-NOT: .alt_entry -;ELF: .set offsetSym0, s +;ELF: offsetSym0 = s ;ELF: .globl offsetSym1 ;ELF-NOT: .alt_entry -;ELF: .set offsetSym1, s+8 +;ELF: offsetSym1 = s+8 %struct.S1 = type { i32, i32, i32 } diff --git a/llvm/test/CodeGen/X86/aliases.ll b/llvm/test/CodeGen/X86/aliases.ll index 03ea2579d0f8a..d36798820fe83 100644 --- a/llvm/test/CodeGen/X86/aliases.ll +++ b/llvm/test/CodeGen/X86/aliases.ll @@ -48,16 +48,16 @@ define i32 @foo_f() { ; CHECK-DAG: .protected bar_p @bar_p = protected alias i32, ptr @bar -; CHECK-DAG: .set test2, bar+4 +; CHECK-DAG: test2 = bar+4 @test2 = alias i32, getelementptr(i32, ptr @bar, i32 1) -; CHECK-DAG: .set test3, 42 +; CHECK-DAG: test3 = 42 @test3 = alias i32, inttoptr(i32 42 to ptr) -; CHECK-DAG: .set test4, bar +; CHECK-DAG: test4 = bar @test4 = alias i32, inttoptr(i64 ptrtoint (ptr @bar to i64) to ptr) -; CHECK-DAG: .set test5, test2-bar +; CHECK-DAG: test5 = test2-bar @test5 = alias i32, inttoptr(i32 sub (i32 ptrtoint (ptr @test2 to i32), i32 ptrtoint (ptr @bar to i32)) to ptr) diff --git a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll index 437d9698ee6bd..ab9fa2287ffad 100644 --- a/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll +++ b/llvm/test/CodeGen/X86/catchret-empty-fallthrough.ll @@ -44,7 +44,7 @@ return: ; preds = %catch, %entry ; CHECK: .LBB0_[[catch:[0-9]+]]: ; CHECK: .seh_handlerdata -; CHECK-NEXT: .set .Lfoo$parent_frame_offset, 32 +; CHECK-NEXT: .Lfoo$parent_frame_offset = 32 ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL diff --git a/llvm/test/CodeGen/X86/coff-alias-type.ll b/llvm/test/CodeGen/X86/coff-alias-type.ll index a242cd2d77d7c..6cc0638b2d4af 100644 --- 
a/llvm/test/CodeGen/X86/coff-alias-type.ll +++ b/llvm/test/CodeGen/X86/coff-alias-type.ll @@ -22,4 +22,4 @@ entry: ; CHECK-NEXT: .scl 2 ; CHECK-NEXT: .type 32 ; CHECK-NEXT: .endef -; CHECK-NEXT: .set _ZN8MyStructC1Ev, _ZN8MyStructC2Ev +; CHECK-NEXT: _ZN8MyStructC1Ev = _ZN8MyStructC2Ev diff --git a/llvm/test/CodeGen/X86/coff-comdat.ll b/llvm/test/CodeGen/X86/coff-comdat.ll index 99b3c0a687afb..084a5a71125ee 100644 --- a/llvm/test/CodeGen/X86/coff-comdat.ll +++ b/llvm/test/CodeGen/X86/coff-comdat.ll @@ -89,4 +89,4 @@ $vftable = comdat largest ; CHECK: .globl _f6 ; CHECK: .section .rdata,"dr",largest,_vftable ; CHECK: .globl _vftable -; CHECK: .set _vftable, L_some_name+4 +; CHECK: _vftable = L_some_name+4 diff --git a/llvm/test/CodeGen/X86/coff-feat00.ll b/llvm/test/CodeGen/X86/coff-feat00.ll index 21dd04ed34c7e..1dcd4276399a9 100644 --- a/llvm/test/CodeGen/X86/coff-feat00.ll +++ b/llvm/test/CodeGen/X86/coff-feat00.ll @@ -4,4 +4,4 @@ define i32 @foo() { ret i32 0 } -; CHECK: .set @feat.00, 1 +; CHECK: @feat.00 = 1 diff --git a/llvm/test/CodeGen/X86/dllexport-x86_64.ll b/llvm/test/CodeGen/X86/dllexport-x86_64.ll index 76add98314f5c..b640e630e47e6 100644 --- a/llvm/test/CodeGen/X86/dllexport-x86_64.ll +++ b/llvm/test/CodeGen/X86/dllexport-x86_64.ll @@ -105,23 +105,23 @@ define weak_odr dllexport void @weak1() { ; MINGW: .ascii " -export:blob_alias" ; CHECK: .globl alias -; CHECK: .set alias, notExported +; CHECK: alias = notExported @alias = dllexport alias void(), ptr @notExported ; CHECK: .globl aliasNotExported -; CHECK: .set aliasNotExported, f1 +; CHECK: aliasNotExported = f1 @aliasNotExported = alias void(), ptr @f1 ; CHECK: .globl alias2 -; CHECK: .set alias2, f1 +; CHECK: alias2 = f1 @alias2 = dllexport alias void(), ptr @f1 ; CHECK: .globl alias3 -; CHECK: .set alias3, notExported +; CHECK: alias3 = notExported @alias3 = dllexport alias void(), ptr @notExported ; CHECK: .weak weak_alias -; CHECK: .set weak_alias, f1 +; CHECK: weak_alias = f1 @weak_alias = 
weak_odr dllexport alias void(), ptr @f1 @blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16 diff --git a/llvm/test/CodeGen/X86/dllexport.ll b/llvm/test/CodeGen/X86/dllexport.ll index 09cc03e7729d9..53ecb8e7a1b4f 100644 --- a/llvm/test/CodeGen/X86/dllexport.ll +++ b/llvm/test/CodeGen/X86/dllexport.ll @@ -135,17 +135,17 @@ define weak_odr dllexport void @weak1() { ; CHECK-GCC: .ascii " -export:weak_alias" ; CHECK: .globl _alias -; CHECK: .set _alias, _notExported +; CHECK: _alias = _notExported @alias = dllexport alias void(), ptr @notExported ; CHECK: .globl _alias2 -; CHECK: .set _alias2, _f1 +; CHECK: _alias2 = _f1 @alias2 = dllexport alias void(), ptr @f1 ; CHECK: .globl _alias3 -; CHECK: .set _alias3, _notExported +; CHECK: _alias3 = _notExported @alias3 = dllexport alias void(), ptr @notExported ; CHECK: .weak _weak_alias -; CHECK: .set _weak_alias, _f1 +; CHECK: _weak_alias = _f1 @weak_alias = weak_odr dllexport alias void(), ptr @f1 diff --git a/llvm/test/CodeGen/X86/ehcontguard.ll b/llvm/test/CodeGen/X86/ehcontguard.ll index 740621bc5d025..e868209babce6 100644 --- a/llvm/test/CodeGen/X86/ehcontguard.ll +++ b/llvm/test/CodeGen/X86/ehcontguard.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=x86_64-pc-windows-msvc | FileCheck %s ; EHCont Guard is currently only available on Windows -; CHECK: .set @feat.00, 16384 +; CHECK: @feat.00 = 16384 ; CHECK: .section .gehcont$y diff --git a/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll b/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll index 53b4bc8f1df2e..4840308a5d498 100644 --- a/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll +++ b/llvm/test/CodeGen/X86/fastcall-correct-mangling.ll @@ -33,5 +33,5 @@ define private x86_fastcallcc void @dontCrash() { } @alias = alias void(i64, i8, i8, i16), ptr @func -; CHECK32-LABEL: {{^}}.set @alias@20, @func@20 -; CHECK64-LABEL: {{^}}.set alias, func +; CHECK32-LABEL: {{^}}@alias@20 = @func@20 +; CHECK64-LABEL: {{^}}alias = func diff --git 
a/llvm/test/CodeGen/X86/ifunc-asm.ll b/llvm/test/CodeGen/X86/ifunc-asm.ll index a4c47da7f4c65..bc8e7e3d7d05b 100644 --- a/llvm/test/CodeGen/X86/ifunc-asm.ll +++ b/llvm/test/CodeGen/X86/ifunc-asm.ll @@ -15,7 +15,7 @@ entry: @foo_ifunc = ifunc i32 (i32), ptr @foo_resolver ; ELF: .globl foo_ifunc ; ELF-NEXT: .type foo_ifunc,@gnu_indirect_function -; ELF-NEXT: .set foo_ifunc, foo_resolver +; ELF-NEXT: foo_ifunc = foo_resolver ; MACHO: .section __DATA,__data ; MACHO-NEXT: .p2align 3, 0x0 diff --git a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll index b8f0661225f82..5199b1519ebea 100644 --- a/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll +++ b/llvm/test/CodeGen/X86/lea-opt-memop-check-1.ll @@ -47,9 +47,9 @@ entry: call fastcc void @"\01?fin$0@0@test2@@"(ptr %tmp0) ret void ; CHECK-LABEL: test2: -; CHECK: .set Ltest2$frame_escape_0, 8 -; CHECK: .set Ltest2$frame_escape_1, 4 -; CHECK: .set Ltest2$frame_escape_2, 0 +; CHECK: Ltest2$frame_escape_0 = 8 +; CHECK: Ltest2$frame_escape_1 = 4 +; CHECK: Ltest2$frame_escape_2 = 0 ; CHECK: calll "?fin$0@0@test2@@" } diff --git a/llvm/test/CodeGen/X86/linux-preemption.ll b/llvm/test/CodeGen/X86/linux-preemption.ll index 8e60b47879754..dc06a34e1c692 100644 --- a/llvm/test/CodeGen/X86/linux-preemption.ll +++ b/llvm/test/CodeGen/X86/linux-preemption.ll @@ -285,18 +285,18 @@ define dso_local ptr @comdat_any_local() comdat { ; CHECK-NEXT: .Lstrong_local_global$local: ; COMMON: .globl strong_default_alias -; COMMON-NEXT: .set strong_default_alias, aliasee +; COMMON-NEXT: strong_default_alias = aliasee ; COMMON-NEXT: .globl strong_hidden_alias ; COMMON-NEXT: .hidden strong_hidden_alias -; COMMON-NEXT: .set strong_hidden_alias, aliasee +; COMMON-NEXT: strong_hidden_alias = aliasee ; COMMON-NEXT: .weak weak_default_alias -; COMMON-NEXT: .set weak_default_alias, aliasee +; COMMON-NEXT: weak_default_alias = aliasee ; COMMON-NEXT: .globl strong_local_alias -; COMMON-NEXT: .set 
strong_local_alias, aliasee -; CHECK-NEXT: .set .Lstrong_local_alias$local, aliasee +; COMMON-NEXT: strong_local_alias = aliasee +; CHECK-NEXT: .Lstrong_local_alias$local = aliasee ; COMMON-NEXT: .weak weak_local_alias -; COMMON-NEXT: .set weak_local_alias, aliasee +; COMMON-NEXT: weak_local_alias = aliasee ; COMMON-NEXT: .globl strong_preemptable_alias -; COMMON-NEXT: .set strong_preemptable_alias, aliasee +; COMMON-NEXT: strong_preemptable_alias = aliasee ; COMMON-NEXT: .weak weak_preemptable_alias -; COMMON-NEXT: .set weak_preemptable_alias, aliasee +; COMMON-NEXT: weak_preemptable_alias = aliasee diff --git a/llvm/test/CodeGen/X86/localescape.ll b/llvm/test/CodeGen/X86/localescape.ll index aee7613273f75..57369be489af3 100644 --- a/llvm/test/CodeGen/X86/localescape.ll +++ b/llvm/test/CodeGen/X86/localescape.ll @@ -76,8 +76,8 @@ define void @alloc_func(i32 %n) { ; X64: .seh_stackalloc 16 ; X64: leaq 16(%rsp), %rbp ; X64: .seh_setframe %rbp, 16 -; X64: .set .Lalloc_func$frame_escape_0, -4 -; X64: .set .Lalloc_func$frame_escape_1, -12 +; X64: .Lalloc_func$frame_escape_0 = -4 +; X64: .Lalloc_func$frame_escape_1 = -12 ; X64: movl $42, -4(%rbp) ; X64: movl $13, -12(%rbp) ; X64: movq %rbp, %rcx @@ -88,8 +88,8 @@ define void @alloc_func(i32 %n) { ; X86: pushl %ebp ; X86: movl %esp, %ebp ; X86: subl $12, %esp -; X86: .set Lalloc_func$frame_escape_0, -4 -; X86: .set Lalloc_func$frame_escape_1, -12 +; X86: Lalloc_func$frame_escape_0 = -4 +; X86: Lalloc_func$frame_escape_1 = -12 ; X86: movl $42, -4(%ebp) ; X86: movl $13, -12(%ebp) ; X86: pushl %ebp @@ -118,8 +118,8 @@ define void @alloc_func_no_frameaddr() { ; X64: subq $40, %rsp ; X64: .seh_stackalloc 40 ; X64: .seh_endprologue -; X64: .set .Lalloc_func_no_frameaddr$frame_escape_0, 36 -; X64: .set .Lalloc_func_no_frameaddr$frame_escape_1, 32 +; X64: .Lalloc_func_no_frameaddr$frame_escape_0 = 36 +; X64: .Lalloc_func_no_frameaddr$frame_escape_1 = 32 ; X64: movl $42, 36(%rsp) ; X64: movl $13, 32(%rsp) ; X64: xorl %ecx, %ecx 
@@ -131,8 +131,8 @@ define void @alloc_func_no_frameaddr() { ; X86-LABEL: alloc_func_no_frameaddr: ; X86: subl $8, %esp -; X86: .set Lalloc_func_no_frameaddr$frame_escape_0, 4 -; X86: .set Lalloc_func_no_frameaddr$frame_escape_1, 0 +; X86: Lalloc_func_no_frameaddr$frame_escape_0 = 4 +; X86: Lalloc_func_no_frameaddr$frame_escape_1 = 0 ; X86: movl $42, 4(%esp) ; X86: movl $13, (%esp) ; X86: pushl $0 diff --git a/llvm/test/CodeGen/X86/pr22019.ll b/llvm/test/CodeGen/X86/pr22019.ll index 4e78bae204428..262ee5fad7375 100644 --- a/llvm/test/CodeGen/X86/pr22019.ll +++ b/llvm/test/CodeGen/X86/pr22019.ll @@ -5,9 +5,9 @@ target triple = "x86_64-unknown-linux-gnu" module asm "pselect = __pselect" module asm "var = __var" module asm "alias = __alias" -; CHECK: .set pselect, __pselect -; CHECK: .set var, __var -; CHECK: .set alias, __alias +; CHECK: pselect = __pselect +; CHECK: var = __var +; CHECK: alias = __alias ; CHECK: pselect: ; CHECK: retq @@ -19,5 +19,5 @@ define void @pselect() { ; CHECK: .long 0 @var = global i32 0 -; CHECK: .set alias, var +; CHECK: alias = var @alias = alias i32, ptr @var diff --git a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll index 3acf999fc4237..bd51ca76c59d1 100644 --- a/llvm/test/CodeGen/X86/seh-catch-all-win32.ll +++ b/llvm/test/CodeGen/X86/seh-catch-all-win32.ll @@ -58,7 +58,7 @@ entry: ; CHECK: pushl %edi ; CHECK: pushl %esi -; CHECK: .set Lmain$frame_escape_0, [[code_offs:[-0-9]+]] +; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]] ; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%ebp) ; CHECK: movl $L__ehtable$main, ; EH state 0 @@ -78,7 +78,7 @@ entry: ; CHECK: calll _printf ; CHECK: .section .xdata,"dr" -; CHECK: .set Lmain$parent_frame_offset, [[reg_offs]] +; CHECK: Lmain$parent_frame_offset = [[reg_offs]] ; CHECK: .p2align 2 ; CHECK: L__ehtable$main ; CHECK-NEXT: .long -1 diff --git a/llvm/test/CodeGen/X86/seh-catchpad.ll b/llvm/test/CodeGen/X86/seh-catchpad.ll index 
7558c4389be59..d958580e5925b 100644 --- a/llvm/test/CodeGen/X86/seh-catchpad.ll +++ b/llvm/test/CodeGen/X86/seh-catchpad.ll @@ -119,7 +119,7 @@ __except.ret: ; preds = %catch.dispatch.7 ; CHECK: jmp .LBB1_[[epilogue]] ; CHECK: .seh_handlerdata -; CHECK-NEXT: .set .Lmain$parent_frame_offset, 32 +; CHECK-NEXT: .Lmain$parent_frame_offset = 32 ; CHECK-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 ; CHECK-NEXT: .Llsda_begin0: ; CHECK-NEXT: .long .Ltmp0@IMGREL diff --git a/llvm/test/CodeGen/X86/seh-finally.ll b/llvm/test/CodeGen/X86/seh-finally.ll index 28e5cf68dd27e..41823dfb38f0a 100644 --- a/llvm/test/CodeGen/X86/seh-finally.ll +++ b/llvm/test/CodeGen/X86/seh-finally.ll @@ -26,7 +26,7 @@ lpad: ; preds = %entry ; X64: retq ; X64: .seh_handlerdata -; X64-NEXT: .set .Lmain$parent_frame_offset, 32 +; X64-NEXT: .Lmain$parent_frame_offset = 32 ; X64-NEXT: .long (.Llsda_end0-.Llsda_begin0)/16 # Number of call sites ; X64-NEXT: .Llsda_begin0: ; X64-NEXT: .long .Ltmp0@IMGREL # LabelStart diff --git a/llvm/test/CodeGen/X86/seh-no-invokes.ll b/llvm/test/CodeGen/X86/seh-no-invokes.ll index 99b81f0eb1bb4..63e91d33d4006 100644 --- a/llvm/test/CodeGen/X86/seh-no-invokes.ll +++ b/llvm/test/CodeGen/X86/seh-no-invokes.ll @@ -15,7 +15,7 @@ ; label. This was PR30431. ; CHECK-LABEL: _f: # @f -; CHECK: .set Lf$parent_frame_offset, 0 +; CHECK: Lf$parent_frame_offset = 0 ; CHECK: retl ; CHECK-LABEL: "?filt$0@0@f@@": # @"\01?filt$0@0@f@@" diff --git a/llvm/test/CodeGen/X86/seh-stack-realign.ll b/llvm/test/CodeGen/X86/seh-stack-realign.ll index 2869bff822314..ae687343cc504 100644 --- a/llvm/test/CodeGen/X86/seh-stack-realign.ll +++ b/llvm/test/CodeGen/X86/seh-stack-realign.ll @@ -51,7 +51,7 @@ entry: ; Check that we can get the exception code from eax to the printf. 
; CHECK-LABEL: _main: -; CHECK: .set Lmain$frame_escape_0, [[code_offs:[-0-9]+]] +; CHECK: Lmain$frame_escape_0 = [[code_offs:[-0-9]+]] ; CHECK: movl %esp, [[reg_offs:[-0-9]+]](%esi) ; CHECK: movl $L__ehtable$main, ; EH state 0 @@ -71,7 +71,7 @@ entry: ; CHECK: calll _printf ; CHECK: .section .xdata,"dr" -; CHECK: .set Lmain$parent_frame_offset, [[reg_offs]] +; CHECK: Lmain$parent_frame_offset = [[reg_offs]] ; CHECK: L__ehtable$main ; CHECK-NEXT: .long -1 ; CHECK-NEXT: .long _filt$main diff --git a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll index d8fcf6d86fa4d..ecbbaf3ab362d 100644 --- a/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll +++ b/llvm/test/CodeGen/X86/tailcall-cgp-dup.ll @@ -34,12 +34,12 @@ define i32 @foo(i32 %x) nounwind ssp { ; CHECK-NEXT: retq ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .data_region jt32 -; CHECK-NEXT: .set L0_0_set_2, LBB0_2-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_3, LBB0_3-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_4, LBB0_4-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_5, LBB0_5-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_6, LBB0_6-LJTI0_0 -; CHECK-NEXT: .set L0_0_set_7, LBB0_7-LJTI0_0 +; CHECK-NEXT: L0_0_set_2 = LBB0_2-LJTI0_0 +; CHECK-NEXT: L0_0_set_3 = LBB0_3-LJTI0_0 +; CHECK-NEXT: L0_0_set_4 = LBB0_4-LJTI0_0 +; CHECK-NEXT: L0_0_set_5 = LBB0_5-LJTI0_0 +; CHECK-NEXT: L0_0_set_6 = LBB0_6-LJTI0_0 +; CHECK-NEXT: L0_0_set_7 = LBB0_7-LJTI0_0 ; CHECK-NEXT: LJTI0_0: ; CHECK-NEXT: .long L0_0_set_2 ; CHECK-NEXT: .long L0_0_set_3 diff --git a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll index 16322cbe9980e..9e44299083d46 100644 --- a/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll +++ b/llvm/test/CodeGen/X86/windows-seh-EHa-TryInFinally.ll @@ -2,7 +2,7 @@ ; CHECK-LABEL: "?fin$0@0@main@@" ; CHECK: .seh_handlerdata -; CHECK: .set ".L?fin$0@0@main@@$parent_frame_offset", 48 +; CHECK: ".L?fin$0@0@main@@$parent_frame_offset" = 48 ; CHECK-NEXT: .long 
(.Llsda_end1-.Llsda_begin1)/16 ; CHECK-NEXT: .Llsda_begin1: ; CHECK-NEXT: .long .Ltmp diff --git a/llvm/test/CodeGen/XCore/globals.ll b/llvm/test/CodeGen/XCore/globals.ll index 134bbb3444b5d..186cfda97104d 100644 --- a/llvm/test/CodeGen/XCore/globals.ll +++ b/llvm/test/CodeGen/XCore/globals.ll @@ -127,4 +127,4 @@ entry: @array = global [10 x i16] zeroinitializer, align 2 ; CHECK: .globl array.globound -; CHECK: .set array.globound, 10 +; CHECK: array.globound = 10 diff --git a/llvm/test/CodeGen/XCore/linkage.ll b/llvm/test/CodeGen/XCore/linkage.ll index 93edf01cf8a96..5bfb83d964dfa 100644 --- a/llvm/test/CodeGen/XCore/linkage.ll +++ b/llvm/test/CodeGen/XCore/linkage.ll @@ -19,14 +19,14 @@ define protected void @test_protected() { } ; CHECK: .globl array.globound -; CHECK: .set array.globound, 2 +; CHECK: array.globound = 2 ; CHECK: .weak array.globound ; CHECK: .globl array ; CHECK: .weak array @array = weak global [2 x i32] zeroinitializer ; CHECK: .globl ac.globound -; CHECK: .set ac.globound, 2 +; CHECK: ac.globound = 2 ; CHECK: .weak ac.globound ; CHECK: .globl ac ; CHECK: .weak ac diff --git a/llvm/test/DebugInfo/X86/dbg-value-range.ll b/llvm/test/DebugInfo/X86/dbg-value-range.ll index 0d49b5eeefd1b..a6ede2814aba3 100644 --- a/llvm/test/DebugInfo/X86/dbg-value-range.ll +++ b/llvm/test/DebugInfo/X86/dbg-value-range.ll @@ -49,9 +49,9 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) nounwind readnone ;CHECK-NEXT: [[CLOBBER:Ltmp[0-9]*]] ;CHECK:Ldebug_loc0: -;CHECK-NEXT: .set Lset{{.*}}, +;CHECK-NEXT: Lset{{.*}} = ;CHECK-NEXT: .quad -;CHECK-NEXT: .set [[CLOBBER_OFF:Lset.*]], [[CLOBBER]]-{{.*}} +;CHECK-NEXT: [[CLOBBER_OFF:Lset.*]] = [[CLOBBER]]-{{.*}} ;CHECK-NEXT: .quad [[CLOBBER_OFF]] ;CHECK-NEXT: .short 1 ## Loc expr size ;CHECK-NEXT: .byte 85 ## DW_OP_reg diff --git a/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll b/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll index 446f31f9a9126..8d4d065641fca 100644 --- 
a/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll +++ b/llvm/test/DebugInfo/X86/stmt-list-multiple-compile-units.ll @@ -64,11 +64,11 @@ ; PR15408 ; ASM: Lcu_begin0: ; ASM-NOT: Lcu_begin -; ASM: .set Lset[[LT:[0-9]+]], Lline_table_start0-Lsection_line ## DW_AT_stmt_list +; ASM: Lset[[LT:[0-9]+]] = Lline_table_start0-Lsection_line ## DW_AT_stmt_list ; ASM-NEXT: .long Lset[[LT]] ; ASM: Lcu_begin1: ; ASM-NOT: Lcu_begin -; ASM: .set Lset[[LT:[0-9]+]], Lline_table_start0-Lsection_line ## DW_AT_stmt_list +; ASM: Lset[[LT:[0-9]+]] = Lline_table_start0-Lsection_line ## DW_AT_stmt_list ; ASM-NEXT: .long Lset[[LT]] define i32 @test(i32 %a) nounwind uwtable ssp !dbg !5 { entry: diff --git a/llvm/test/MC/AArch64/basic-a64-instructions.s b/llvm/test/MC/AArch64/basic-a64-instructions.s index 14ac11f581a55..b2ec5b6ac3678 100644 --- a/llvm/test/MC/AArch64/basic-a64-instructions.s +++ b/llvm/test/MC/AArch64/basic-a64-instructions.s @@ -3349,7 +3349,7 @@ _func: .equ equvalue, 0x0001 movk x1, equvalue, lsl 16 -// CHECK: .set equvalue, 1 +// CHECK: equvalue = 1 // CHECK-NEXT: movk x1, #1, lsl #16 // encoding: [0x21,0x00,0xa0,0xf2] movz x2, #:abs_g0:sym diff --git a/llvm/test/MC/AsmParser/assignment.s b/llvm/test/MC/AsmParser/assignment.s index 6f84a1c338dad..8c8984c12ac36 100644 --- a/llvm/test/MC/AsmParser/assignment.s +++ b/llvm/test/MC/AsmParser/assignment.s @@ -1,22 +1,22 @@ # RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s # CHECK: TEST0: -# CHECK: .set a, 0 +# CHECK: a = 0 TEST0: a = 0 # CHECK: TEST1: -# CHECK: .set b, 0 +# CHECK: b = 0 TEST1: - .set b, 0 + b = 0 # CHECK: .globl _f1 -# CHECK: .set _f1, 0 +# CHECK: _f1 = 0 .globl _f1 _f1 = 0 # CHECK: .globl _f2 -# CHECK: .set _f2, 0 +# CHECK: _f2 = 0 .globl _f2 - .set _f2, 0 + _f2 = 0 diff --git a/llvm/test/MC/AsmParser/directive_include.s b/llvm/test/MC/AsmParser/directive_include.s index 8d2ef2753b23a..f53bc671fc646 100644 --- a/llvm/test/MC/AsmParser/directive_include.s +++ 
b/llvm/test/MC/AsmParser/directive_include.s @@ -2,7 +2,7 @@ # CHECK: TESTA: # CHECK: TEST0: -# CHECK: .set a, 0 +# CHECK: a = 0 # CHECK: TESTB: TESTA: .include "directive\137set.s" # "\137" is underscore "_" diff --git a/llvm/test/MC/AsmParser/directive_set.s b/llvm/test/MC/AsmParser/directive_set.s index 65dd33d1d54fb..4b93de01b309d 100644 --- a/llvm/test/MC/AsmParser/directive_set.s +++ b/llvm/test/MC/AsmParser/directive_set.s @@ -1,13 +1,13 @@ # RUN: llvm-mc -triple i386-unknown-elf %s | FileCheck %s # CHECK: TEST0: -# CHECK: .set a, 0 +# CHECK: a = 0 # CHECK-NOT: .no_dead_strip a TEST0: - .set a, 0 + a = 0 # CHECK: TEST1: -# CHECK: .set a, 0 +# CHECK: a = 0 # CHECK-NOT: .no_dead_strip a TEST1: .equ a, 0 diff --git a/llvm/test/MC/AsmParser/include.ll b/llvm/test/MC/AsmParser/include.ll index 3321f0a6a2872..22c9eaf7a36e9 100644 --- a/llvm/test/MC/AsmParser/include.ll +++ b/llvm/test/MC/AsmParser/include.ll @@ -10,5 +10,5 @@ entry: ret void } -; CHECK: .set MODULE, 1 -; CHECK: .set FUNCTION, 1 +; CHECK: MODULE = 1 +; CHECK: FUNCTION = 1 diff --git a/llvm/test/MC/AsmParser/labels.s b/llvm/test/MC/AsmParser/labels.s index 599ce72c44eef..6a9870b655f2f 100644 --- a/llvm/test/MC/AsmParser/labels.s +++ b/llvm/test/MC/AsmParser/labels.s @@ -18,12 +18,12 @@ foo: // CHECK: addl $24, a$b+10(%eax) addl $24, ("a$b" + 10)(%eax) -// CHECK: .set b$c, 10 +// CHECK: b$c = 10 "b$c" = 10 // CHECK: addl $10, %eax addl $"b$c", %eax -// CHECK: .set "a 0", 11 +// CHECK: "a 0" = 11 .set "a 0", 11 // CHECK: .long 11 @@ -49,7 +49,7 @@ foo: // CHECX: .lsym "a 8",1 // .lsym "a 8", 1 -// CHECK: .set "a 9", a-b +// CHECK: "a 9" = a-b .set "a 9", a - b // CHECK: .long "a 9" diff --git a/llvm/test/MC/AsmParser/macro-arg-darwin.s b/llvm/test/MC/AsmParser/macro-arg-darwin.s index 8671107539ce7..88c63dd488be4 100644 --- a/llvm/test/MC/AsmParser/macro-arg-darwin.s +++ b/llvm/test/MC/AsmParser/macro-arg-darwin.s @@ -38,7 +38,7 @@ bar .endif .endm .macro bottom - .set fred, $0 + fred = $0 .endm .text 
@@ -49,7 +49,7 @@ top bar, 42 // CHECK: _foo: // CHECK-NOT: fred // CHECK: _bar -// CHECK-NEXT: .set fred, 42 +// CHECK-NEXT: fred = 42 .macro foo diff --git a/llvm/test/MC/AsmParser/motorola_integers.s b/llvm/test/MC/AsmParser/motorola_integers.s index c75d9a5e0cb14..1ec2e02e97f02 100644 --- a/llvm/test/MC/AsmParser/motorola_integers.s +++ b/llvm/test/MC/AsmParser/motorola_integers.s @@ -1,10 +1,10 @@ # RUN: llvm-mc -triple i386-unknown-unknown -motorola-integers %s | FileCheck %s -# CHECK: .set a, 2882400009 -.set a, $aBcDeF09 -# CHECK: .set b, 256 -.set b, $0100 -# CHECK: .set c, 10 -.set c, %01010 -# CHECK: .set d, 1 -.set d, %1 +# CHECK: a = 2882400009 +a = $aBcDeF09 +# CHECK: b = 256 +b = $0100 +# CHECK: c = 10 +c = %01010 +# CHECK: d = 1 +d = %1 diff --git a/llvm/test/MC/Mips/cpsetup.s b/llvm/test/MC/Mips/cpsetup.s index 4a027c6e796ae..f948d650da94d 100644 --- a/llvm/test/MC/Mips/cpsetup.s +++ b/llvm/test/MC/Mips/cpsetup.s @@ -196,7 +196,7 @@ IMM_8 = 8 # ALL-LABEL: : # ASM-LABEL: t1b: -# ASM-NEXT: .set IMM_8, 8 +# ASM-NEXT: IMM_8 = 8 # O32-NOT: __cerror From 95bbaca6c1dcabb03bd67aabe3aaa4730a11200d Mon Sep 17 00:00:00 2001 From: Rajveer Singh Bharadwaj Date: Thu, 12 Jun 2025 10:54:01 +0530 Subject: [PATCH 187/851] [AArch64] Extend usage of `XAR` instruction for fixed-length operations (#139460) --- .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 102 +++++-- llvm/test/CodeGen/AArch64/xar.ll | 250 +++++++++++++++++- 2 files changed, 324 insertions(+), 28 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 11cb91fbe02d4..009d69b2b9433 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -4606,7 +4606,33 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) { return false; } - if (!Subtarget->hasSHA3()) + // We have Neon SHA3 XAR operation for v2i64 but for types + // v4i32, v8i16, v16i8 we can use SVE operations when 
SVE2-SHA3 + // is available. + EVT SVT; + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v4i32: + case MVT::v2i32: + SVT = MVT::nxv4i32; + break; + case MVT::v8i16: + case MVT::v4i16: + SVT = MVT::nxv8i16; + break; + case MVT::v16i8: + case MVT::v8i8: + SVT = MVT::nxv16i8; + break; + case MVT::v2i64: + case MVT::v1i64: + SVT = Subtarget->hasSHA3() ? MVT::v2i64 : MVT::nxv2i64; + break; + default: + return false; + } + + if ((!SVT.isScalableVector() && !Subtarget->hasSHA3()) || + (SVT.isScalableVector() && !Subtarget->hasSVE2())) return false; if (N0->getOpcode() != AArch64ISD::VSHL || @@ -4632,7 +4658,8 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) { SDValue Imm = CurDAG->getTargetConstant( ShAmt, DL, N0.getOperand(1).getValueType(), false); - if (ShAmt + HsAmt != 64) + unsigned VTSizeInBits = VT.getScalarSizeInBits(); + if (ShAmt + HsAmt != VTSizeInBits) return false; if (!IsXOROperand) { @@ -4640,33 +4667,76 @@ bool AArch64DAGToDAGISel::trySelectXAR(SDNode *N) { SDNode *MOV = CurDAG->getMachineNode(AArch64::MOVIv2d_ns, DL, MVT::v2i64, Zero); SDValue MOVIV = SDValue(MOV, 0); + R1 = N1->getOperand(0); R2 = MOVIV; } - // If the input is a v1i64, widen to a v2i64 to use XAR. 
- assert((VT == MVT::v1i64 || VT == MVT::v2i64) && "Unexpected XAR type!"); - if (VT == MVT::v1i64) { - EVT SVT = MVT::v2i64; + if (SVT != VT) { SDValue Undef = - SDValue(CurDAG->getMachineNode(AArch64::IMPLICIT_DEF, DL, SVT), 0); - SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + SDValue(CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, SVT), 0); + + if (SVT.isScalableVector() && VT.is64BitVector()) { + EVT QVT = VT.getDoubleNumVectorElementsVT(*CurDAG->getContext()); + + SDValue UndefQ = SDValue( + CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, QVT), 0); + SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + + R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, QVT, + UndefQ, R1, DSub), + 0); + if (R2.getValueType() == VT) + R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, QVT, + UndefQ, R2, DSub), + 0); + } + + SDValue SubReg = CurDAG->getTargetConstant( + (SVT.isScalableVector() ? AArch64::zsub : AArch64::dsub), DL, MVT::i32); + R1 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, Undef, - R1, DSub), + R1, SubReg), 0); - if (R2.getValueType() == MVT::v1i64) + + if (SVT.isScalableVector() || R2.getValueType() != SVT) R2 = SDValue(CurDAG->getMachineNode(AArch64::INSERT_SUBREG, DL, SVT, - Undef, R2, DSub), + Undef, R2, SubReg), 0); } SDValue Ops[] = {R1, R2, Imm}; - SDNode *XAR = CurDAG->getMachineNode(AArch64::XAR, DL, MVT::v2i64, Ops); + SDNode *XAR = nullptr; + + if (SVT.isScalableVector()) { + if (auto Opc = SelectOpcodeFromVT( + SVT, {AArch64::XAR_ZZZI_B, AArch64::XAR_ZZZI_H, AArch64::XAR_ZZZI_S, + AArch64::XAR_ZZZI_D})) + XAR = CurDAG->getMachineNode(Opc, DL, SVT, Ops); + } else { + XAR = CurDAG->getMachineNode(AArch64::XAR, DL, SVT, Ops); + } - if (VT == MVT::v1i64) { - SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); - XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT, - SDValue(XAR, 0), DSub); + assert(XAR && "Unexpected 
NULL value for XAR instruction in DAG"); + + if (SVT != VT) { + if (VT.is64BitVector() && SVT.isScalableVector()) { + EVT QVT = VT.getDoubleNumVectorElementsVT(*CurDAG->getContext()); + + SDValue ZSub = CurDAG->getTargetConstant(AArch64::zsub, DL, MVT::i32); + SDNode *Q = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, QVT, + SDValue(XAR, 0), ZSub); + + SDValue DSub = CurDAG->getTargetConstant(AArch64::dsub, DL, MVT::i32); + XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT, + SDValue(Q, 0), DSub); + } else { + SDValue SubReg = CurDAG->getTargetConstant( + (SVT.isScalableVector() ? AArch64::zsub : AArch64::dsub), DL, + MVT::i32); + XAR = CurDAG->getMachineNode(AArch64::EXTRACT_SUBREG, DL, VT, + SDValue(XAR, 0), SubReg); + } } ReplaceNode(N, XAR); return true; diff --git a/llvm/test/CodeGen/AArch64/xar.ll b/llvm/test/CodeGen/AArch64/xar.ll index d682f4f4a1bfb..652617b58eaf3 100644 --- a/llvm/test/CodeGen/AArch64/xar.ll +++ b/llvm/test/CodeGen/AArch64/xar.ll @@ -1,6 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 ; RUN: llc -mtriple=aarch64 -mattr=+sha3 < %s | FileCheck --check-prefix=SHA3 %s ; RUN: llc -mtriple=aarch64 -mattr=-sha3 < %s | FileCheck --check-prefix=NOSHA3 %s +; RUN: llc -mtriple=aarch64 -mattr=+sve2 < %s | FileCheck --check-prefix=SVE2 %s + +/* 128-bit vectors */ define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) { ; SHA3-LABEL: xar: @@ -14,6 +17,14 @@ define <2 x i64> @xar(<2 x i64> %x, <2 x i64> %y) { ; NOSHA3-NEXT: shl v0.2d, v1.2d, #10 ; NOSHA3-NEXT: usra v0.2d, v1.2d, #54 ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: // kill: def $q1 killed $q1 def $z1 +; SVE2-NEXT: xar z0.d, z0.d, z1.d, #54 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret %a = xor <2 x i64> %x, %y %b = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %a, <2 x i64> ) ret <2 x i64> %b @@ -34,24 +45,40 
@@ define <1 x i64> @xar_v1i64(<1 x i64> %a, <1 x i64> %b) { ; NOSHA3-NEXT: shl d0, d1, #1 ; NOSHA3-NEXT: usra d0, d1, #63 ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_v1i64: +; SVE2: // %bb.0: +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: xar z0.d, z0.d, z1.d, #63 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret %v.val = xor <1 x i64> %a, %b %fshl = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> %v.val, <1 x i64> %v.val, <1 x i64> splat (i64 1)) ret <1 x i64> %fshl } -define <2 x i64> @xar_instead_of_or1(<2 x i64> %r) { -; SHA3-LABEL: xar_instead_of_or1: +define <2 x i64> @xar_instead_of_or_v2i64(<2 x i64> %r) { +; SHA3-LABEL: xar_instead_of_or_v2i64: ; SHA3: // %bb.0: // %entry ; SHA3-NEXT: movi v1.2d, #0000000000000000 ; SHA3-NEXT: xar v0.2d, v0.2d, v1.2d, #39 ; SHA3-NEXT: ret ; -; NOSHA3-LABEL: xar_instead_of_or1: +; NOSHA3-LABEL: xar_instead_of_or_v2i64: ; NOSHA3: // %bb.0: // %entry ; NOSHA3-NEXT: shl v1.2d, v0.2d, #25 ; NOSHA3-NEXT: usra v1.2d, v0.2d, #39 ; NOSHA3-NEXT: mov v0.16b, v1.16b ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v2i64: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: xar z0.d, z0.d, z1.d, #39 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret entry: %or = call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %r, <2 x i64> %r, <2 x i64> splat (i64 25)) ret <2 x i64> %or @@ -72,67 +99,266 @@ define <1 x i64> @xar_instead_of_or_v1i64(<1 x i64> %v.val) { ; NOSHA3-NEXT: usra d1, d0, #63 ; NOSHA3-NEXT: fmov d0, d1 ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v1i64: +; SVE2: // %bb.0: +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: xar z0.d, z0.d, z1.d, #63 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret %fshl = tail call <1 x i64> @llvm.fshl.v1i64(<1 x i64> 
%v.val, <1 x i64> %v.val, <1 x i64> splat (i64 1)) ret <1 x i64> %fshl } -define <4 x i32> @xar_instead_of_or2(<4 x i32> %r) { -; SHA3-LABEL: xar_instead_of_or2: +define <4 x i32> @xar_instead_of_or_v4i32(<4 x i32> %r) { +; SHA3-LABEL: xar_instead_of_or_v4i32: ; SHA3: // %bb.0: // %entry ; SHA3-NEXT: shl v1.4s, v0.4s, #25 ; SHA3-NEXT: usra v1.4s, v0.4s, #7 ; SHA3-NEXT: mov v0.16b, v1.16b ; SHA3-NEXT: ret ; -; NOSHA3-LABEL: xar_instead_of_or2: +; NOSHA3-LABEL: xar_instead_of_or_v4i32: ; NOSHA3: // %bb.0: // %entry ; NOSHA3-NEXT: shl v1.4s, v0.4s, #25 ; NOSHA3-NEXT: usra v1.4s, v0.4s, #7 ; NOSHA3-NEXT: mov v0.16b, v1.16b ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v4i32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: xar z0.s, z0.s, z1.s, #7 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret entry: %or = call <4 x i32> @llvm.fshl.v2i32(<4 x i32> %r, <4 x i32> %r, <4 x i32> splat (i32 25)) ret <4 x i32> %or } -define <8 x i16> @xar_instead_of_or3(<8 x i16> %r) { -; SHA3-LABEL: xar_instead_of_or3: +define <8 x i16> @xar_instead_of_or_v8i16(<8 x i16> %r) { +; SHA3-LABEL: xar_instead_of_or_v8i16: ; SHA3: // %bb.0: // %entry ; SHA3-NEXT: shl v1.8h, v0.8h, #9 ; SHA3-NEXT: usra v1.8h, v0.8h, #7 ; SHA3-NEXT: mov v0.16b, v1.16b ; SHA3-NEXT: ret ; -; NOSHA3-LABEL: xar_instead_of_or3: +; NOSHA3-LABEL: xar_instead_of_or_v8i16: ; NOSHA3: // %bb.0: // %entry ; NOSHA3-NEXT: shl v1.8h, v0.8h, #9 ; NOSHA3-NEXT: usra v1.8h, v0.8h, #7 ; NOSHA3-NEXT: mov v0.16b, v1.16b ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v8i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: xar z0.h, z0.h, z1.h, #7 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret entry: %or = call <8 x i16> @llvm.fshl.v2i16(<8 x i16> %r, <8 x i16> %r, <8 x i16> splat (i16 25)) ret <8 x 
i16> %or } -define <16 x i8> @xar_instead_of_or4(<16 x i8> %r) { -; SHA3-LABEL: xar_instead_of_or4: +define <16 x i8> @xar_instead_of_or_v16i8(<16 x i8> %r) { +; SHA3-LABEL: xar_instead_of_or_v16i8: ; SHA3: // %bb.0: // %entry ; SHA3-NEXT: add v1.16b, v0.16b, v0.16b ; SHA3-NEXT: usra v1.16b, v0.16b, #7 ; SHA3-NEXT: mov v0.16b, v1.16b ; SHA3-NEXT: ret ; -; NOSHA3-LABEL: xar_instead_of_or4: +; NOSHA3-LABEL: xar_instead_of_or_v16i8: ; NOSHA3: // %bb.0: // %entry ; NOSHA3-NEXT: add v1.16b, v0.16b, v0.16b ; NOSHA3-NEXT: usra v1.16b, v0.16b, #7 ; NOSHA3-NEXT: mov v0.16b, v1.16b ; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v16i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $q0 killed $q0 def $z0 +; SVE2-NEXT: xar z0.b, z0.b, z1.b, #7 +; SVE2-NEXT: // kill: def $q0 killed $q0 killed $z0 +; SVE2-NEXT: ret entry: %or = call <16 x i8> @llvm.fshl.v2i8(<16 x i8> %r, <16 x i8> %r, <16 x i8> splat (i8 25)) ret <16 x i8> %or } +/* 64 bit vectors */ + +define <2 x i32> @xar_v2i32(<2 x i32> %x, <2 x i32> %y) { +; SHA3-LABEL: xar_v2i32: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; SHA3-NEXT: shl v0.2s, v1.2s, #25 +; SHA3-NEXT: usra v0.2s, v1.2s, #7 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_v2i32: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; NOSHA3-NEXT: shl v0.2s, v1.2s, #25 +; NOSHA3-NEXT: usra v0.2s, v1.2s, #7 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_v2i32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: xar z0.s, z0.s, z1.s, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %a = xor <2 x i32> %x, %y + %b = call <2 x i32> @llvm.fshl(<2 x i32> %a, <2 x i32> %a, <2 x i32> ) + ret <2 x i32> %b +} + +define <2 x i32> @xar_instead_of_or_v2i32(<2 x i32> %r) { +; SHA3-LABEL: xar_instead_of_or_v2i32: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: 
shl v1.2s, v0.2s, #25 +; SHA3-NEXT: usra v1.2s, v0.2s, #7 +; SHA3-NEXT: fmov d0, d1 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_instead_of_or_v2i32: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: shl v1.2s, v0.2s, #25 +; NOSHA3-NEXT: usra v1.2s, v0.2s, #7 +; NOSHA3-NEXT: fmov d0, d1 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v2i32: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: xar z0.s, z0.s, z1.s, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %or = call <2 x i32> @llvm.fshl(<2 x i32> %r, <2 x i32> %r, <2 x i32> splat (i32 25)) + ret <2 x i32> %or +} + +define <4 x i16> @xar_v4i16(<4 x i16> %x, <4 x i16> %y) { +; SHA3-LABEL: xar_v4i16: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; SHA3-NEXT: shl v0.4h, v1.4h, #9 +; SHA3-NEXT: usra v0.4h, v1.4h, #7 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_v4i16: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; NOSHA3-NEXT: shl v0.4h, v1.4h, #9 +; NOSHA3-NEXT: usra v0.4h, v1.4h, #7 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_v4i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: xar z0.h, z0.h, z1.h, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %a = xor <4 x i16> %x, %y + %b = call <4 x i16> @llvm.fshl(<4 x i16> %a, <4 x i16> %a, <4 x i16> splat (i16 25)) + ret <4 x i16> %b +} + +define <4 x i16> @xar_instead_of_or_v4i16(<4 x i16> %r) { +; SHA3-LABEL: xar_instead_of_or_v4i16: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: shl v1.4h, v0.4h, #9 +; SHA3-NEXT: usra v1.4h, v0.4h, #7 +; SHA3-NEXT: fmov d0, d1 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_instead_of_or_v4i16: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: shl v1.4h, v0.4h, #9 +; NOSHA3-NEXT: usra v1.4h, v0.4h, #7 +; NOSHA3-NEXT: fmov d0, d1 +; NOSHA3-NEXT: ret +; +; 
SVE2-LABEL: xar_instead_of_or_v4i16: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: xar z0.h, z0.h, z1.h, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %or = call <4 x i16> @llvm.fshl(<4 x i16> %r, <4 x i16> %r, <4 x i16> splat (i16 25)) + ret <4 x i16> %or +} + +define <8 x i8> @xar_v8i8(<8 x i8> %x, <8 x i8> %y) { +; SHA3-LABEL: xar_v8i8: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; SHA3-NEXT: add v0.8b, v1.8b, v1.8b +; SHA3-NEXT: usra v0.8b, v1.8b, #7 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_v8i8: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: eor v1.8b, v0.8b, v1.8b +; NOSHA3-NEXT: add v0.8b, v1.8b, v1.8b +; NOSHA3-NEXT: usra v0.8b, v1.8b, #7 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_v8i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: // kill: def $d1 killed $d1 def $z1 +; SVE2-NEXT: xar z0.b, z0.b, z1.b, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %a = xor <8 x i8> %x, %y + %b = call <8 x i8> @llvm.fshl(<8 x i8> %a, <8 x i8> %a, <8 x i8> splat (i8 25)) + ret <8 x i8> %b +} + +define <8 x i8> @xar_instead_of_or_v8i8(<8 x i8> %r) { +; SHA3-LABEL: xar_instead_of_or_v8i8: +; SHA3: // %bb.0: // %entry +; SHA3-NEXT: add v1.8b, v0.8b, v0.8b +; SHA3-NEXT: usra v1.8b, v0.8b, #7 +; SHA3-NEXT: fmov d0, d1 +; SHA3-NEXT: ret +; +; NOSHA3-LABEL: xar_instead_of_or_v8i8: +; NOSHA3: // %bb.0: // %entry +; NOSHA3-NEXT: add v1.8b, v0.8b, v0.8b +; NOSHA3-NEXT: usra v1.8b, v0.8b, #7 +; NOSHA3-NEXT: fmov d0, d1 +; NOSHA3-NEXT: ret +; +; SVE2-LABEL: xar_instead_of_or_v8i8: +; SVE2: // %bb.0: // %entry +; SVE2-NEXT: movi v1.2d, #0000000000000000 +; SVE2-NEXT: // kill: def $d0 killed $d0 def $z0 +; SVE2-NEXT: xar z0.b, z0.b, z1.b, #7 +; SVE2-NEXT: // kill: def $d0 killed $d0 killed $z0 +; SVE2-NEXT: ret +entry: + %or = call <8 x i8> @llvm.fshl(<8 x 
i8> %r, <8 x i8> %r, <8 x i8> splat (i8 25)) + ret <8 x i8> %or +} + declare <2 x i64> @llvm.fshl.v2i64(<2 x i64>, <2 x i64>, <2 x i64>) declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>) declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>) From 2efff47363f18966cd37461323b5db5418183534 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Wed, 11 Jun 2025 22:43:06 -0700 Subject: [PATCH 188/851] [NFCI][msan] Show that shadow for partially undefined constant vectors is computed as fully initialized (#143823) This happens because `getShadow(Value *V)` has a special case for fully undefined/poisoned values, but partially undefined values fall-through and are given a clean shadow. This leads to false negatives (no false positives). Note: MSan correctly handles InsertElementInst, but the shadow of the initial constant vector may still be wrong and be propagated. Showing that the same approximation happens for other composite types is left as an exercise for the reader. --- .../Instrumentation/MemorySanitizer.cpp | 4 + .../MemorySanitizer/partial-poison.ll | 78 +++++++++++++++++++ 2 files changed, 82 insertions(+) create mode 100644 llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index c2315d5de7041..d3c6a7151ec37 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -2085,6 +2085,10 @@ struct MemorySanitizerVisitor : public InstVisitor { assert(ShadowPtr && "Could not find shadow for an argument"); return ShadowPtr; } + + // TODO: Partially undefined vectors are handled by the fall-through case + // below (see partial-poison.ll); this causes false negatives. + // For everything else the shadow is zero. 
return getCleanShadow(V); } diff --git a/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll new file mode 100644 index 0000000000000..5164441c17e10 --- /dev/null +++ b/llvm/test/Instrumentation/MemorySanitizer/partial-poison.ll @@ -0,0 +1,78 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -S -passes='msan' 2>&1 | FileCheck %s +; +; Test case to show that MSan computes shadows for partially poisoned vectors +; as fully initialized, resulting in false negatives. + +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <2 x i64> @left_poison(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @left_poison( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> +; + ret <2 x i64> +} + +define <2 x i64> @right_poison(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @right_poison( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> +; + ret <2 x i64> +} + +define <2 x i64> @full_poison(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @full_poison( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> splat (i64 -1), ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> poison +; + ret <2 x i64> +} + +define <2 x i64> @no_poison_or_undef(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @no_poison_or_undef( +; CHECK-SAME: ptr 
[[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> splat (i64 42) +; + ret <2 x i64> +} + +define <2 x i64> @left_undef(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @left_undef( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> +; + ret <2 x i64> +} + +define <2 x i64> @right_undef(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @right_undef( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> zeroinitializer, ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> +; + ret <2 x i64> +} + +define <2 x i64> @full_undef(ptr %add.ptr) sanitize_memory { +; CHECK-LABEL: define <2 x i64> @full_undef( +; CHECK-SAME: ptr [[ADD_PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.donothing() +; CHECK-NEXT: store <2 x i64> splat (i64 -1), ptr @__msan_retval_tls, align 8 +; CHECK-NEXT: ret <2 x i64> undef +; + ret <2 x i64> +} From bec85f3b187f57713e01191381c88134e122bd35 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Thu, 12 Jun 2025 08:58:26 +0300 Subject: [PATCH 189/851] [LLD] [COFF] [test] Readd lto-late-arm.ll (#143494) This testcase was removed in 4cafd28b7dd92080103d11cccc78d9a2f01e1242, as a082f665f85b1002ab22af263eeafceca5288657 had made it no longer trigger the error that it was supposed to do. (Because the latter of those two commits makes the symbol "__rt_sdiv" be included among the potential libcalls listed by lto::LTO::getRuntimeLibcallSymbols().) Readd the test as a positive test, making sure that such libcalls can get linked. 
We do have preexisting test coverage for LTO libcalls overall in libcall-archive.ll, but readd this test to cover specifically the ARM division helper functions as well. --- lld/test/COFF/lto-late-arm.ll | 38 +++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 lld/test/COFF/lto-late-arm.ll diff --git a/lld/test/COFF/lto-late-arm.ll b/lld/test/COFF/lto-late-arm.ll new file mode 100644 index 0000000000000..1070fc52a0136 --- /dev/null +++ b/lld/test/COFF/lto-late-arm.ll @@ -0,0 +1,38 @@ +; REQUIRES: arm + +;; A bitcode file can generate undefined references to symbols that weren't +;; listed as undefined on the bitcode file itself, when lowering produces +;; calls to e.g. builtin helper functions. Ideally all those functions are +;; listed by lto::LTO::getRuntimeLibcallSymbols(), then we successfully +;; can link cases when the helper functions are provided as bitcode too. +;; (In practice, compiler-rt builtins are always compiled with -fno-lto, so +;; this shouldn't really happen anyway.) 
+ +; RUN: rm -rf %t.dir +; RUN: split-file %s %t.dir +; RUN: llvm-as %t.dir/main.ll -o %t.main.obj +; RUN: llvm-as %t.dir/sdiv.ll -o %t.sdiv.obj +; RUN: llvm-ar rcs %t.sdiv.lib %t.sdiv.obj + +; RUN: lld-link /entry:entry %t.main.obj %t.sdiv.lib /out:%t.exe /subsystem:console + +;--- main.ll +target datalayout = "e-m:w-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7-w64-windows-gnu" + +@num = dso_local global i32 100 + +define dso_local arm_aapcs_vfpcc i32 @entry(i32 %param) { +entry: + %0 = load i32, ptr @num + %div = sdiv i32 %0, %param + ret i32 %div +} +;--- sdiv.ll +target datalayout = "e-m:w-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "thumbv7-w64-windows-gnu" + +define dso_local arm_aapcs_vfpcc void @__rt_sdiv() { +entry: + ret void +} From 9d491bc602c2d9730cb42fe25f0753471a3af389 Mon Sep 17 00:00:00 2001 From: David Green Date: Thu, 12 Jun 2025 07:03:09 +0100 Subject: [PATCH 190/851] [AArch64][GlobalISel] Enable extract_vec_elt_combines postlegalization. 
--- llvm/lib/Target/AArch64/AArch64Combine.td | 2 +- .../AArch64/vec-combine-compare-to-bitmask.ll | 51 +++++++------------ 2 files changed, 18 insertions(+), 35 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td index 571e2692cbfff..ca09598464d13 100644 --- a/llvm/lib/Target/AArch64/AArch64Combine.td +++ b/llvm/lib/Target/AArch64/AArch64Combine.td @@ -361,7 +361,7 @@ def AArch64PostLegalizerCombiner ptr_add_immed_chain, overlapping_and, split_store_zero_128, undef_combines, select_to_minmax, or_to_bsp, combine_concat_vector, - commute_constant_to_rhs, + commute_constant_to_rhs, extract_vec_elt_combines, push_freeze_to_prevent_poison_from_propagating, combine_mul_cmlt, combine_use_vector_truncate, extmultomull]> { } diff --git a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll index 77483ebb2235c..d6d323530946e 100644 --- a/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll +++ b/llvm/test/CodeGen/AArch64/vec-combine-compare-to-bitmask.ll @@ -596,23 +596,15 @@ define i4 @convert_to_bitmask_4xi8(<4 x i8> %vec) { ; CHECK-GI-NEXT: mov.b v1[3], w8 ; CHECK-GI-NEXT: cmeq.8b v0, v0, v1 ; CHECK-GI-NEXT: mvn.8b v0, v0 -; CHECK-GI-NEXT: umov.b w8, v0[0] -; CHECK-GI-NEXT: umov.b w9, v0[1] -; CHECK-GI-NEXT: fmov s1, w8 -; CHECK-GI-NEXT: umov.b w8, v0[2] -; CHECK-GI-NEXT: mov.s v1[1], w9 -; CHECK-GI-NEXT: umov.b w9, v0[3] -; CHECK-GI-NEXT: mov.s v1[2], w8 -; CHECK-GI-NEXT: mov.s v1[3], w9 -; CHECK-GI-NEXT: mov.s w8, v1[1] -; CHECK-GI-NEXT: mov.s w9, v1[2] -; CHECK-GI-NEXT: fmov w11, s1 -; CHECK-GI-NEXT: mov.s w10, v1[3] +; CHECK-GI-NEXT: umov.b w8, v0[1] +; CHECK-GI-NEXT: umov.b w9, v0[0] +; CHECK-GI-NEXT: umov.b w10, v0[2] +; CHECK-GI-NEXT: umov.b w11, v0[3] ; CHECK-GI-NEXT: and w8, w8, #0x1 -; CHECK-GI-NEXT: bfi w11, w8, #1, #31 -; CHECK-GI-NEXT: and w8, w9, #0x1 -; CHECK-GI-NEXT: and w9, w10, #0x1 -; CHECK-GI-NEXT: orr w8, w11, w8, 
lsl #2 +; CHECK-GI-NEXT: bfi w9, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w10, #0x1 +; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 +; CHECK-GI-NEXT: and w9, w11, #0x1 ; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 ; CHECK-GI-NEXT: strb w8, [sp, #15] ; CHECK-GI-NEXT: and w0, w8, #0xff @@ -871,28 +863,19 @@ define i6 @no_combine_illegal_num_elements(<6 x i32> %vec) { ; CHECK-GI-NEXT: cmtst.4s v1, v1, v1 ; CHECK-GI-NEXT: mov.s w8, v1[1] ; CHECK-GI-NEXT: mov.s w9, v1[2] +; CHECK-GI-NEXT: fmov w11, s1 ; CHECK-GI-NEXT: mov.s w10, v1[3] -; CHECK-GI-NEXT: mov.h v1[1], w8 -; CHECK-GI-NEXT: mov.s w8, v0[1] -; CHECK-GI-NEXT: mov.h v1[2], w9 -; CHECK-GI-NEXT: mov.h v1[3], w10 -; CHECK-GI-NEXT: mov.h v1[4], v0[0] -; CHECK-GI-NEXT: mov.h v1[5], w8 -; CHECK-GI-NEXT: umov.h w8, v1[1] -; CHECK-GI-NEXT: umov.h w9, v1[0] -; CHECK-GI-NEXT: umov.h w10, v1[2] -; CHECK-GI-NEXT: umov.h w11, v1[3] ; CHECK-GI-NEXT: and w8, w8, #0x1 -; CHECK-GI-NEXT: bfi w9, w8, #1, #31 -; CHECK-GI-NEXT: and w8, w10, #0x1 -; CHECK-GI-NEXT: umov.h w10, v1[4] -; CHECK-GI-NEXT: orr w8, w9, w8, lsl #2 -; CHECK-GI-NEXT: and w9, w11, #0x1 -; CHECK-GI-NEXT: umov.h w11, v1[5] -; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: bfi w11, w8, #1, #31 +; CHECK-GI-NEXT: and w8, w9, #0x1 ; CHECK-GI-NEXT: and w9, w10, #0x1 +; CHECK-GI-NEXT: mov.s w10, v0[1] +; CHECK-GI-NEXT: orr w8, w11, w8, lsl #2 +; CHECK-GI-NEXT: orr w8, w8, w9, lsl #3 +; CHECK-GI-NEXT: fmov w9, s0 +; CHECK-GI-NEXT: and w9, w9, #0x1 ; CHECK-GI-NEXT: orr w8, w8, w9, lsl #4 -; CHECK-GI-NEXT: and w9, w11, #0x1 +; CHECK-GI-NEXT: and w9, w10, #0x1 ; CHECK-GI-NEXT: orr w8, w8, w9, lsl #5 ; CHECK-GI-NEXT: and w8, w8, #0x3f ; CHECK-GI-NEXT: strb w8, [sp, #15] From 3f0cf742ac4eb3437450f8f263081ea951248851 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 12 Jun 2025 14:40:38 +0800 Subject: [PATCH 191/851] [C++20] [Modules] [Reduced BMI] Don't write specializations with local args Close https://github.com/llvm/llvm-project/issues/119947 As discussed in the above 
thread, we shouldn't write specializations with local args in reduced BMI. Since users can't find such specializations any way. --- clang/lib/Serialization/ASTWriterDecl.cpp | 45 +++++++++++++++++++ clang/test/Modules/pr119947.cppm | 54 +++++++++++++++++++++++ 2 files changed, 99 insertions(+) create mode 100644 clang/test/Modules/pr119947.cppm diff --git a/clang/lib/Serialization/ASTWriterDecl.cpp b/clang/lib/Serialization/ASTWriterDecl.cpp index 8f82324a27535..052cb5a253bf7 100644 --- a/clang/lib/Serialization/ASTWriterDecl.cpp +++ b/clang/lib/Serialization/ASTWriterDecl.cpp @@ -221,6 +221,48 @@ namespace clang { Record.AddDeclRef(F.second); } + template bool shouldSkipWritingSpecializations(T *Spec) { + // Now we will only avoid writing specializations if we're generating + // reduced BMI. + if (!GeneratingReducedBMI) + return false; + + assert((isa(Spec))); + + ArrayRef Args; + if (auto *CTSD = dyn_cast(Spec)) + Args = CTSD->getTemplateArgs().asArray(); + else if (auto *VTSD = dyn_cast(Spec)) + Args = VTSD->getTemplateArgs().asArray(); + else + Args = cast(Spec) + ->getTemplateSpecializationArgs() + ->asArray(); + + // If there is any template argument is TULocal, we can avoid writing the + // specialization since the consumers of reduced BMI won't get the + // specialization anyway. + for (const TemplateArgument &TA : Args) { + switch (TA.getKind()) { + case TemplateArgument::Type: { + Linkage L = TA.getAsType()->getLinkage(); + if (!isExternallyVisible(L)) + return true; + break; + } + case TemplateArgument::Declaration: + if (!TA.getAsDecl()->isExternallyVisible()) + return true; + break; + default: + break; + } + } + + return false; + } + /// Add to the record the first template specialization from each module /// file that provides a declaration of D. 
We store the DeclId and an /// ODRHash of the template arguments of D which should provide enough @@ -235,6 +277,9 @@ namespace clang { CollectFirstDeclFromEachModule(D, /*IncludeLocal*/ true, Firsts); for (const auto &F : Firsts) { + if (shouldSkipWritingSpecializations(F.second)) + continue; + if (isa(F.second)) PartialSpecsInMap.push_back(F.second); diff --git a/clang/test/Modules/pr119947.cppm b/clang/test/Modules/pr119947.cppm new file mode 100644 index 0000000000000..40de2cad3c0d7 --- /dev/null +++ b/clang/test/Modules/pr119947.cppm @@ -0,0 +1,54 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/a.cppm -emit-reduced-module-interface -o %t/a.pcm +// RUN: %clang_cc1 -std=c++20 %t/use.cpp -fprebuilt-module-path=%t -emit-llvm -o - + + +//--- a.cppm +export module a; + +struct a_inner { + ~a_inner() { + } + void f(auto) { + } +}; + +export template +struct a { + a() { + struct local {}; + inner.f(local()); + } +private: + a_inner inner; +}; + + +namespace { + +struct s { +}; + +} // namespace + +void f() { + a x; +} + +//--- use.cpp +import a; + +namespace { + +struct s { +}; + +} // namespace + +void g() { + a x; +} + From 6157028fea93ff14af18b173dd01eb431cfb6aef Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 12 Jun 2025 09:19:50 +0200 Subject: [PATCH 192/851] [BasicAA][ValueTracking] Increase depth for underlying object search (#143714) This depth limits a linear search (rather than the usual potentially exponential one) and is not particularly important for compile-time in practice. The change in #137297 is going to increase the length of GEP chains, so I'd like to increase this limit a bit to reduce the chance of regressions (https://github.com/dtcxzyw/llvm-opt-benchmark/pull/2419 showed a 13% increase in SearchLimitReached). There is no particular significance to the new value of 10. Compile-time is neutral. 
--- llvm/include/llvm/Analysis/ValueTracking.h | 2 +- .../BasicAA/gep-decomposition-limit.ll | 38 +++++++++++-------- .../underlying-objects-2.ll | 5 ++- .../inline-noalias-unidentify-object.ll | 22 +++++++---- 4 files changed, 42 insertions(+), 25 deletions(-) diff --git a/llvm/include/llvm/Analysis/ValueTracking.h b/llvm/include/llvm/Analysis/ValueTracking.h index 32ab9733d13c9..e215c90b5a72a 100644 --- a/llvm/include/llvm/Analysis/ValueTracking.h +++ b/llvm/include/llvm/Analysis/ValueTracking.h @@ -47,7 +47,7 @@ constexpr unsigned MaxAnalysisRecursionDepth = 6; /// The max limit of the search depth in DecomposeGEPExpression() and /// getUnderlyingObject(). -constexpr unsigned MaxLookupSearchDepth = 6; +constexpr unsigned MaxLookupSearchDepth = 10; /// Determine which bits of V are known to be either zero or one and return /// them in the KnownZero/KnownOne bit sets. diff --git a/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll b/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll index 23a96ebca8485..a256ececbe565 100644 --- a/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll +++ b/llvm/test/Analysis/BasicAA/gep-decomposition-limit.ll @@ -2,22 +2,22 @@ ; CHECK-LABEL: Function: test ;; Before limit: -; CHECK-DAG: MustAlias: i8* %gep.add5, i8* %gep.inc5 -; CHECK-DAG: NoAlias: i8* %gep.inc3, i8* %gep.inc5 -; CHECK-DAG: NoAlias: i8* %gep.inc4, i8* %gep.inc5 +; CHECK-DAG: MustAlias: i8* %gep.add9, i8* %gep.inc9 +; CHECK-DAG: NoAlias: i8* %gep.inc7, i8* %gep.inc9 +; CHECK-DAG: NoAlias: i8* %gep.inc8, i8* %gep.inc9 ;; At limit: -; CHECK-DAG: MustAlias: i8* %gep.add6, i8* %gep.inc6 -; CHECK-DAG: NoAlias: i8* %gep.inc4, i8* %gep.inc6 -; CHECK-DAG: NoAlias: i8* %gep.inc5, i8* %gep.inc6 +; CHECK-DAG: MustAlias: i8* %gep.add10, i8* %gep.inc10 +; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc8 +; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc9 ;; After limit: -; CHECK-DAG: MayAlias: i8* %gep.add7, i8* %gep.inc7 -; CHECK-DAG: MayAlias: i8* %gep.inc5, i8* 
%gep.inc7 -; CHECK-DAG: NoAlias: i8* %gep.inc6, i8* %gep.inc7 +; CHECK-DAG: MayAlias: i8* %gep.add11, i8* %gep.inc11 +; CHECK-DAG: MayAlias: i8* %gep.inc11, i8* %gep.inc9 +; CHECK-DAG: NoAlias: i8* %gep.inc10, i8* %gep.inc11 define void @test(ptr %base) { - %gep.add5 = getelementptr i8, ptr %base, i64 5 - %gep.add6 = getelementptr i8, ptr %base, i64 6 - %gep.add7 = getelementptr i8, ptr %base, i64 7 + %gep.add9 = getelementptr i8, ptr %base, i64 9 + %gep.add10 = getelementptr i8, ptr %base, i64 10 + %gep.add11 = getelementptr i8, ptr %base, i64 11 %gep.inc1 = getelementptr i8, ptr %base, i64 1 %gep.inc2 = getelementptr i8, ptr %gep.inc1, i64 1 @@ -26,15 +26,23 @@ define void @test(ptr %base) { %gep.inc5 = getelementptr i8, ptr %gep.inc4, i64 1 %gep.inc6 = getelementptr i8, ptr %gep.inc5, i64 1 %gep.inc7 = getelementptr i8, ptr %gep.inc6, i64 1 + %gep.inc8 = getelementptr i8, ptr %gep.inc7, i64 1 + %gep.inc9 = getelementptr i8, ptr %gep.inc8, i64 1 + %gep.inc10 = getelementptr i8, ptr %gep.inc9, i64 1 + %gep.inc11 = getelementptr i8, ptr %gep.inc10, i64 1 - load i8, ptr %gep.add5 - load i8, ptr %gep.add6 - load i8, ptr %gep.add7 + load i8, ptr %gep.add9 + load i8, ptr %gep.add10 + load i8, ptr %gep.add11 load i8, ptr %gep.inc3 load i8, ptr %gep.inc4 load i8, ptr %gep.inc5 load i8, ptr %gep.inc6 load i8, ptr %gep.inc7 + load i8, ptr %gep.inc8 + load i8, ptr %gep.inc9 + load i8, ptr %gep.inc10 + load i8, ptr %gep.inc11 ret void } diff --git a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll index abfdff79dc113..1d3512128678e 100644 --- a/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll +++ b/llvm/test/Analysis/LoopAccessAnalysis/underlying-objects-2.ll @@ -127,9 +127,12 @@ for_j.body: %gepB7 = getelementptr inbounds i8, ptr %gepB6, i64 0 %gepB8 = getelementptr inbounds i8, ptr %gepB7, i64 0 %gepB9 = getelementptr inbounds i8, ptr %gepB8, i64 0 + %gepB10 = getelementptr inbounds 
i8, ptr %gepB9, i64 0 + %gepB11 = getelementptr inbounds i8, ptr %gepB10, i64 0 + %gepB12 = getelementptr inbounds i8, ptr %gepB11, i64 0 %loadPrev = load i8, ptr %gepPrev, align 1 - %loadB = load i8, ptr %gepB9, align 1 + %loadB = load i8, ptr %gepB12, align 1 %mul = mul i8 %loadPrev, %loadB diff --git a/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll b/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll index 54e9ee0918ae8..b7ba1b32238a7 100644 --- a/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll +++ b/llvm/test/Transforms/Inline/inline-noalias-unidentify-object.ll @@ -3,15 +3,18 @@ define i32 @caller(ptr %p) { ; CHECK-LABEL: define i32 @caller(ptr %p) { ; CHECK-NEXT: call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]]) -; CHECK-NEXT: [[P_8_I:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8 -; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P_8_I]], align 4, !alias.scope !0 -; CHECK-NEXT: [[P_1_I:%.*]] = getelementptr i8, ptr [[P]], i64 1 +; CHECK-NEXT: [[P_11_I:%.*]] = getelementptr i8, ptr %p, i64 11 +; CHECK-NEXT: [[V_I:%.*]] = load i32, ptr [[P_11_I]], align 4, !alias.scope !0 +; CHECK-NEXT: [[P_1_I:%.*]] = getelementptr i8, ptr %p, i64 1 ; CHECK-NEXT: [[P_2_I:%.*]] = getelementptr i8, ptr [[P_1_I]], i64 1 ; CHECK-NEXT: [[P_3_I:%.*]] = getelementptr i8, ptr [[P_2_I]], i64 1 ; CHECK-NEXT: [[P_4_I:%.*]] = getelementptr i8, ptr [[P_3_I]], i64 1 ; CHECK-NEXT: [[P_5_I:%.*]] = getelementptr i8, ptr [[P_4_I]], i64 1 ; CHECK-NEXT: [[P_6_I:%.*]] = getelementptr i8, ptr [[P_5_I]], i64 1 -; CHECK-NEXT: [[P_7_I:%.*]] = getelementptr i8, ptr [[P_6_I]], i64 1 +; CHECK-NEXT: [[P_7_I1:%.*]] = getelementptr i8, ptr [[P_6_I]], i64 1 +; CHECK-NEXT: [[P_8_I:%.*]] = getelementptr i8, ptr [[P_7_I1]], i64 1 +; CHECK-NEXT: [[P_9_I:%.*]] = getelementptr i8, ptr [[P_8_I]], i64 1 +; CHECK-NEXT: [[P_7_I:%.*]] = getelementptr i8, ptr [[P_9_I]], i64 1 ; CHECK-NEXT: [[P_8_ALIAS_I:%.*]] = getelementptr i8, ptr [[P_7_I]], i64 1 ; 
CHECK-NEXT: store i32 42, ptr [[P_8_ALIAS_I]], align 4 ; CHECK-NEXT: ret i32 [[V_I]] @@ -21,8 +24,8 @@ define i32 @caller(ptr %p) { } define internal i32 @callee(ptr noalias %p) { - %p.8 = getelementptr i8, ptr %p, i64 8 - %v = load i32, ptr %p.8 + %p.11 = getelementptr i8, ptr %p, i64 11 + %v = load i32, ptr %p.11 %p.1 = getelementptr i8, ptr %p, i64 1 %p.2 = getelementptr i8, ptr %p.1, i64 1 %p.3 = getelementptr i8, ptr %p.2, i64 1 @@ -30,7 +33,10 @@ define internal i32 @callee(ptr noalias %p) { %p.5 = getelementptr i8, ptr %p.4, i64 1 %p.6 = getelementptr i8, ptr %p.5, i64 1 %p.7 = getelementptr i8, ptr %p.6, i64 1 - %p.8.alias = getelementptr i8, ptr %p.7, i64 1 - store i32 42, ptr %p.8.alias + %p.8 = getelementptr i8, ptr %p.7, i64 1 + %p.9 = getelementptr i8, ptr %p.8, i64 1 + %p.10 = getelementptr i8, ptr %p.9, i64 1 + %p.11.alias = getelementptr i8, ptr %p.10, i64 1 + store i32 42, ptr %p.11.alias ret i32 %v } From 77062244ed56be61aecda28d6fede3432545f741 Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Thu, 12 Jun 2025 09:29:40 +0200 Subject: [PATCH 193/851] Fix two instances of -Wparentheses warnings [NFC] Add parentheses around the assert conditions. 
Without this gcc warned like ../lib/Target/AMDGPU/GCNSchedStrategy.cpp:2250: warning: suggest parentheses around '&&' within '||' [-Wparentheses] 2250 | NewMI != RegionBounds.second && "cannot remove at region end"); and ../../clang/lib/Sema/SemaOverload.cpp:11326:39: warning: suggest parentheses around '&&' within '||' [-Wparentheses] 11326 | DeferredCandidatesCount == 0 && | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~^~ 11327 | "Unexpected deferred template candidates"); | ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ --- clang/lib/Sema/SemaOverload.cpp | 6 +++--- llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index cf455f4588de3..89e86f49a3ca8 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -11322,9 +11322,9 @@ OverloadingResult OverloadCandidateSet::BestViableFunction(Sema &S, SourceLocation Loc, iterator &Best) { - assert(shouldDeferTemplateArgumentDeduction(S.getLangOpts()) || - DeferredCandidatesCount == 0 && - "Unexpected deferred template candidates"); + assert((shouldDeferTemplateArgumentDeduction(S.getLangOpts()) || + DeferredCandidatesCount == 0) && + "Unexpected deferred template candidates"); bool TwoPhaseResolution = DeferredCandidatesCount != 0 && !ResolutionByPerfectCandidateIsDisabled; diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 0f80462050cda..7165cf89ca45d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -2246,8 +2246,8 @@ void PreRARematStage::finalizeGCNSchedStage() { void GCNScheduleDAGMILive::updateRegionBoundaries( RegionBoundaries &RegionBounds, MachineBasicBlock::iterator MI, MachineInstr *NewMI) { - assert(!NewMI || - NewMI != RegionBounds.second && "cannot remove at region end"); + assert((!NewMI || NewMI != RegionBounds.second) && + "cannot remove at region 
end"); if (RegionBounds.first == RegionBounds.second) { assert(NewMI && "cannot remove from an empty region"); From 2d35b568ef949717e35df664d4d9352eddbffbfd Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 12 Jun 2025 09:27:24 +0100 Subject: [PATCH 194/851] [X86] bsf.ll - add icmp_ne coverage to bsf passthrough tests --- llvm/test/CodeGen/X86/bsf.ll | 56 ++++++++++++++++++------------------ 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/llvm/test/CodeGen/X86/bsf.ll b/llvm/test/CodeGen/X86/bsf.ll index 58929115baf54..312f94c041235 100644 --- a/llvm/test/CodeGen/X86/bsf.ll +++ b/llvm/test/CodeGen/X86/bsf.ll @@ -38,13 +38,13 @@ define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testb %al, %al -; X86-NEXT: je .LBB1_1 +; X86-NEXT: jne .LBB1_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; X86-NEXT: .LBB1_1: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -56,8 +56,8 @@ define i8 @cmov_bsf8_undef(i8 %x, i8 %y) nounwind { ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %1 = tail call i8 @llvm.cttz.i8(i8 %x, i1 true) - %2 = icmp eq i8 %x, 0 - %3 = select i1 %2, i8 %y, i8 %1 + %2 = icmp ne i8 %x, 0 + %3 = select i1 %2, i8 %1, i8 %y ret i8 %3 } @@ -66,14 +66,14 @@ define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testw %ax, %ax -; X86-NEXT: je .LBB2_1 +; X86-NEXT: jne .LBB2_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: orl $65536, %eax # imm = 0x10000 -; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; X86-NEXT: .LBB2_1: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orl $65536, %eax # imm 
= 0x10000 +; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; @@ -87,8 +87,8 @@ define i16 @cmov_bsf16(i16 %x, i16 %y) nounwind { ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq %1 = tail call i16 @llvm.cttz.i16(i16 %x, i1 false) - %2 = icmp eq i16 %x, 0 - %3 = select i1 %2, i16 %y, i16 %1 + %2 = icmp ne i16 %x, 0 + %3 = select i1 %2, i16 %1, i16 %y ret i16 %3 } @@ -157,12 +157,12 @@ define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: testl %eax, %eax -; X86-NEXT: je .LBB5_1 +; X86-NEXT: jne .LBB5_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: rep bsfl %eax, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; X86-NEXT: .LBB5_1: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: rep bsfl %eax, %eax ; X86-NEXT: retl ; ; X64-LABEL: cmov_bsf32_undef: @@ -171,8 +171,8 @@ define i32 @cmov_bsf32_undef(i32 %x, i32 %y) nounwind { ; X64-NEXT: cmovel %esi, %eax ; X64-NEXT: retq %1 = tail call i32 @llvm.cttz.i32(i32 %x, i1 true) - %2 = icmp eq i32 %x, 0 - %3 = select i1 %2, i32 %y, i32 %1 + %2 = icmp ne i32 %x, 0 + %3 = select i1 %2, i32 %1, i32 %y ret i32 %3 } @@ -199,7 +199,7 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind { ; X86-NEXT: movl $64, %eax ; X86-NEXT: orl %ecx, %esi ; X86-NEXT: jne .LBB6_7 -; X86-NEXT: .LBB6_6: +; X86-NEXT: .LBB6_6: # %cond.end ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: .LBB6_7: # %cond.end @@ -218,8 +218,8 @@ define i64 @cmov_bsf64(i64 %x, i64 %y) nounwind { ; X64-NEXT: cmoveq %rsi, %rax ; X64-NEXT: retq %1 = tail call i64 @llvm.cttz.i64(i64 %x, i1 false) - %2 = icmp eq i64 %x, 0 - %3 = select i1 %2, i64 %y, i64 %1 + %2 = icmp ne i64 %x, 0 + %3 = select i1 %2, i64 %1, i64 %y ret i64 %3 } @@ -375,10 +375,10 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X86-NEXT: orl %ebx, %ebp ; X86-NEXT: orl %edi, %ebp ; X86-NEXT: je .LBB9_11 -; 
X86-NEXT: # %bb.1: # %select.false.sink +; X86-NEXT: # %bb.1: # %select.true.sink ; X86-NEXT: testl %edx, %edx ; X86-NEXT: jne .LBB9_2 -; X86-NEXT: # %bb.3: # %select.false.sink +; X86-NEXT: # %bb.3: # %select.true.sink ; X86-NEXT: rep bsfl %ecx, %edi ; X86-NEXT: addl $32, %edi ; X86-NEXT: testl %ebx, %ebx @@ -402,20 +402,20 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X86-NEXT: rep bsfl %edx, %edi ; X86-NEXT: testl %ebx, %ebx ; X86-NEXT: jne .LBB9_5 -; X86-NEXT: .LBB9_6: # %select.false.sink +; X86-NEXT: .LBB9_6: # %select.true.sink ; X86-NEXT: rep bsfl %esi, %esi ; X86-NEXT: addl $32, %esi ; X86-NEXT: orl %ecx, %edx ; X86-NEXT: jne .LBB9_9 -; X86-NEXT: .LBB9_8: # %select.false.sink +; X86-NEXT: .LBB9_8: # %select.true.sink ; X86-NEXT: addl $64, %esi ; X86-NEXT: movl %esi, %edi -; X86-NEXT: .LBB9_9: # %select.false.sink +; X86-NEXT: .LBB9_9: # %select.true.sink ; X86-NEXT: movl %edi, (%eax) ; X86-NEXT: movl $0, 12(%eax) ; X86-NEXT: movl $0, 8(%eax) ; X86-NEXT: movl $0, 4(%eax) -; X86-NEXT: .LBB9_10: # %select.false.sink +; X86-NEXT: .LBB9_10: # %select.true.sink ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi ; X86-NEXT: popl %ebx @@ -427,7 +427,7 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: orq %rsi, %rax ; X64-NEXT: je .LBB9_2 -; X64-NEXT: # %bb.1: # %select.false.sink +; X64-NEXT: # %bb.1: # %select.true.sink ; X64-NEXT: rep bsfq %rdi, %rcx ; X64-NEXT: rep bsfq %rsi, %rax ; X64-NEXT: addq $64, %rax @@ -440,8 +440,8 @@ define i128 @cmov_bsf128_undef(i128 %x, i128 %y) nounwind { ; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: retq %1 = tail call i128 @llvm.cttz.i128(i128 %x, i1 true) - %2 = icmp eq i128 %x, 0 - %3 = select i1 %2, i128 %y, i128 %1 + %2 = icmp ne i128 %x, 0 + %3 = select i1 %2, i128 %1, i128 %y ret i128 %3 } From 6e5a1423b752c66273bfcff35aaa8083075788a8 Mon Sep 17 00:00:00 2001 From: Ian Wood Date: Thu, 12 Jun 2025 01:28:27 -0700 Subject: [PATCH 195/851] [mlir] Reapply "Loosen 
restrictions on folding dynamic reshapes" (#142827) The original PR https://github.com/llvm/llvm-project/pull/137963 had a nvidia bot failure. This appears to be a flaky test because rerunning the build was successful. This change needs commit 6f2ba47 to fix incorrect usage of `getReassociationIndicesForCollapse`. Reverts llvm/llvm-project#142639 Co-authored-by: Artem Gindinson --- mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp | 372 +++++++++++++++--- .../Dialect/Linalg/simplify-pack-unpack.mlir | 4 +- mlir/test/Dialect/Tensor/canonicalize.mlir | 39 +- mlir/unittests/Dialect/Utils/CMakeLists.txt | 1 + .../Dialect/Utils/ReshapeOpsUtilsTest.cpp | 203 ++++++++++ 5 files changed, 560 insertions(+), 59 deletions(-) create mode 100644 mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp index 1a04d702e0559..3b1fdb69e8ef1 100644 --- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp +++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp @@ -10,6 +10,10 @@ #include "mlir/IR/AffineMap.h" #include "mlir/IR/Builders.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Support/LogicalResult.h" #include #include @@ -28,67 +32,329 @@ mlir::getReassociationIndicesForReshape(ShapedType sourceType, return std::nullopt; } -std::optional> -mlir::getReassociationIndicesForCollapse(ArrayRef sourceShape, - ArrayRef targetShape) { - if (sourceShape.size() <= targetShape.size()) - return std::nullopt; - unsigned sourceDim = 0; - SmallVector reassociationMap; - reassociationMap.reserve(targetShape.size()); +namespace { +/// A simple struct to represent ReassociationIndices as an inclusive interval. +/// It's designed to be feasibly minimal, so the call sites should manage the +/// validity of the range manually. +struct ReassociationIndexRange { + /// FIXME: Signed type is used for consistency with ReassociationIndices. 
+ /// We should consider refactoring all reassociation utilities to use unsigned + /// types. + int64_t leftIdx = 0, rightIdx = 0; + + /// Util for manual checks of the range's validity + LogicalResult verify() const { + return leftIdx >= 0 && (leftIdx <= rightIdx) ? success() : failure(); + } + + /// Checks range's containment within another range. Treats the edges + /// non-exclusively. + bool isInRange(const ReassociationIndexRange &outerRange) const { + return leftIdx >= outerRange.leftIdx && rightIdx <= outerRange.rightIdx; + } + + unsigned size() const { + assert(succeeded(verify())); + return rightIdx - leftIdx + 1; + } + bool containsSingleIndex() const { return size() == 1; } + + /// Collects indices that do not overlap between this and another range. + ReassociationIndices + getNonOverlappingIndicesWith(ReassociationIndexRange &rhs) const { + if (rightIdx < rhs.leftIdx) { + // The intervals do not overlap - concatenate the indices from both. + auto jointFullIndices = getFullIndices(); + jointFullIndices.append(rhs.getFullIndices()); + return jointFullIndices; + } + ReassociationIndices result; + // Handle the chunk left of the overlapping range. + int64_t leftStart = std::min(leftIdx, rhs.leftIdx); + int64_t leftEnd = std::max(leftIdx, rhs.leftIdx); + llvm::append_range(result, llvm::seq(leftStart, leftEnd)); + // Handle the chunk right of the overlapping range. Symmetrically, we should + // skip the edge of the overlap AND include the rightmost index. + int64_t rightStart = std::min(rightIdx, rhs.rightIdx) + 1; + int64_t rightEnd = std::max(rightIdx, rhs.rightIdx); + if (rightStart < rightEnd) + llvm::append_range(result, llvm::seq_inclusive(rightStart, rightEnd)); + return result; + } + + /// Converts the range into ReassociationIndices. 
+ ReassociationIndices getFullIndices() const { + ReassociationIndices result; + for (int64_t idx = leftIdx; idx <= rightIdx; ++idx) { + result.push_back(idx); + } + return result; + } +}; +} // namespace + +/// Starting from `sourceStartIdx`, searches `sourceShape` for the first +/// sequence that can be collapsed into a dynamic dimension (at least one must +/// be present in the source). +/// By default, lazily returns once the first dynamic dimension has been found. +/// Setting `matchGreedily` as `true` will also mark all subsequent +/// source dimensions for collapsing into the target. +static FailureOr +findReassociationRangeForDynamicDim(ArrayRef sourceShape, + int64_t sourceStartIdx, + bool matchGreedily = false) { + const unsigned numSourceDims = sourceShape.size(); + ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1}; + std::optional resultRange = std::nullopt; + + ReassociationIndexRange iterationRange{sourceStartIdx, sourceStartIdx}; + for (; iterationRange.isInRange(sourceShapeAsRange); + iterationRange.rightIdx++) { + int64_t sourceSize = sourceShape[iterationRange.rightIdx]; + if (sourceSize == ShapedType::kDynamic) { + resultRange = iterationRange; + break; + } + } + if (!resultRange) + return failure(); + if (matchGreedily) + resultRange->rightIdx = sourceShapeAsRange.rightIdx; + return *resultRange; +} - ReassociationIndices currIndices; +/// Starting from `sourceStartIdx`, searches `sourceShape` for the first +/// sequence of static dimensions such that their product matches `targetSize`. +/// By default, lazily returns once the product matches the target size. Setting +/// `matchGreedily` as `true` will append all neighboring unit dimensions +/// (dimensions of 1) to the match. 
+static FailureOr +findReassociationRangeForSize(ArrayRef sourceShape, + int64_t sourceStartIdx, int64_t targetSize, + bool matchGreedily = false) { + const unsigned numSourceDims = sourceShape.size(); + ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1}; + std::optional resultRange = std::nullopt; + + ReassociationIndexRange iterationRange{sourceStartIdx, sourceStartIdx}; int64_t prodOfCollapsedDims = 1; - while (sourceDim < sourceShape.size()) { - unsigned targetDim = reassociationMap.size(); - // If we have mapped all the target dimensions stop and handle the remaining - // tail of size-1 dimensions explicitly. - if (targetDim == targetShape.size()) + while (iterationRange.isInRange(sourceShapeAsRange)) { + int64_t sourceSize = sourceShape[iterationRange.rightIdx]; + if (sourceSize == ShapedType::kDynamic) { + // Reassociation for a static dim cannot include a dynamic dim. Reset + // induction variables to essentially restart the loop from the next + // source dimension. + prodOfCollapsedDims = 1; + iterationRange = {iterationRange.rightIdx + 1, + iterationRange.rightIdx + 1}; + continue; + } + prodOfCollapsedDims *= sourceSize; + // If the target size has been exceeded without matching, we need to shift + // the range start right. From the start of the range, roll back the + // multiplication until the target size exceeds the product again. + while (prodOfCollapsedDims > targetSize && + !iterationRange.containsSingleIndex()) { + int64_t frontSourceSize = sourceShape[iterationRange.leftIdx]; + prodOfCollapsedDims /= frontSourceSize; + // Shrink the range rightwards + iterationRange.leftIdx++; + } + // We could've reached the target size with the current dimension, + // also as a result of the above shift to right. 
+ if (prodOfCollapsedDims == targetSize) { + resultRange = iterationRange; break; + } + // Increment the iteration range + iterationRange.rightIdx++; + } + if (!resultRange) + return failure(); + if (matchGreedily) { + // We now want to collect all unit dimensions directly after the target + // product match. Advance the iterator to avoid OOB when the product match + // happens at the last element. + iterationRange.rightIdx++; + while (iterationRange.isInRange(sourceShapeAsRange) && + sourceShape[iterationRange.rightIdx] == 1) { + resultRange = iterationRange; + iterationRange.rightIdx++; + } + } + return *resultRange; +} - int64_t currTargetShape = targetShape[targetDim]; - while (sourceDim < (sourceShape.size() - 1) && - sourceShape[sourceDim] != ShapedType::kDynamic && - prodOfCollapsedDims * sourceShape[sourceDim] < currTargetShape) { - prodOfCollapsedDims *= sourceShape[sourceDim]; - currIndices.push_back(sourceDim++); +/// Attempts to find a valid collapsing reassociation of `sourceShape` into +/// `targetShape` through a simple traversal. If successful, an array of source +/// index ranges is returned, correspondingly to each dimension in the target +/// shape. The resulting indices shall fully cover the `sourceShape` without +/// overlaps. +/// +/// The algorithm is essentially a lazy one, searching for non-greedy matches - +/// it will only yield a greedy match for the last target dimension. +/// FIXME: The algorithm can only backtrack when it needs to append an offset +/// for a static target dimension to the preceding dynamic one (this retains the +/// linear complexity). 
As feasible, consider adding further backtracking +/// routines to enable more reassociations, e.g.: +/// - ?x2x?x2 into ?x2 +static FailureOr> +findReassociationRangesForCollapse(ArrayRef sourceShape, + ArrayRef targetShape) { + unsigned numSourceDims = sourceShape.size(), + numTargetDims = targetShape.size(); + assert(numSourceDims > numTargetDims); + ReassociationIndexRange sourceShapeAsRange{0, numSourceDims - 1}; + + SmallVector reassocRanges; + reassocRanges.reserve(numTargetDims); + // We'll iterate in strides of 2 to enable pseudo-backtracking for simple + // cases, e.g.: + // - ?x2x3x5 into ?x15 + std::optional prevTargetSize = std::nullopt; + for (unsigned targetDimIdx = 0, sourceDimIdx = 0; + targetDimIdx < numTargetDims; ++targetDimIdx) { + int64_t targetSize = targetShape[targetDimIdx]; + // Simply check if there are any subsequent target dimensions left - if not, + // the match must be made greedily. + bool shouldMatchGreedily = targetDimIdx == numTargetDims - 1; + FailureOr sourceRange; + if (targetSize == ShapedType::kDynamic) { + sourceRange = findReassociationRangeForDynamicDim( + sourceShape, sourceDimIdx, shouldMatchGreedily); + } else { + sourceRange = findReassociationRangeForSize( + sourceShape, sourceDimIdx, targetSize, shouldMatchGreedily); } - // If the current expanded dimension is dynamic, then the collapsed - // dimensions should also be dynamic and product of all previous unprocessed - // dimensions of the expanded shape should be 1. - if (sourceShape[sourceDim] == ShapedType::kDynamic && - (currTargetShape != ShapedType::kDynamic || prodOfCollapsedDims != 1)) - return std::nullopt; - - // If the collapsed dim is dynamic, the current expanded dim should also - // be dynamic. - if (currTargetShape == ShapedType::kDynamic && - sourceShape[sourceDim] != ShapedType::kDynamic) - return std::nullopt; - - // For static shapes, if the product of dimensions of the expanded shape - // should match the collapsed dimension shape. 
- if (prodOfCollapsedDims * sourceShape[sourceDim] != currTargetShape) - return std::nullopt; - - currIndices.push_back(sourceDim++); - reassociationMap.emplace_back(ReassociationIndices{}); - std::swap(reassociationMap.back(), currIndices); - prodOfCollapsedDims = 1; + // Run sanity checks on the returned index range. + if (failed(sourceRange) || failed(sourceRange->verify()) || + !sourceRange->isInRange(sourceShapeAsRange)) + return failure(); + if (sourceRange->leftIdx > sourceDimIdx) { + // If some source dimensions had to be skipped in order to find a match, + // they must be collapsed into the directly preceding dynamic dimension. + if (!prevTargetSize || prevTargetSize != ShapedType::kDynamic) + return failure(); + reassocRanges.back().rightIdx = sourceRange->leftIdx - 1; + } + + // Store the gathered information as required for the next iteration. + prevTargetSize = targetSize; + sourceDimIdx = sourceRange->rightIdx + 1; + reassocRanges.push_back(*sourceRange); } - // All the dimensions in the target must have been processed. - if (reassociationMap.size() != targetShape.size()) + // Fail if the source shape wasn't a full match for the target shape. We only + // need to check the last recorded index - any other gaps should have been + // mended by the main loop. + if (reassocRanges.back().rightIdx < sourceShapeAsRange.rightIdx) + return failure(); + return reassocRanges; +} + +/// A variant of `findReassociationRangesForCollapse(...)` that can also scan +/// the shapes right-to-left. +static FailureOr> +findReassociationRangesForCollapse(ArrayRef sourceShape, + ArrayRef targetShape, + bool iterateRightToLeft) { + if (!iterateRightToLeft) + return findReassociationRangesForCollapse(sourceShape, targetShape); + // NB: To iterate right-to-left, we currently reverse the shapes and then + // reverse the result back. The reversed shapes must not be temporary, as + // we're passing through an ArrayRef. 
+ // FIXME: It would be preferable to avoid the expensive copies. At the moment, + // this approach is chosen for readability of the main implementation. + std::vector sourceToReverse = sourceShape.vec(), + targetToReverse = targetShape.vec(); + std::reverse(sourceToReverse.begin(), sourceToReverse.end()); + std::reverse(targetToReverse.begin(), targetToReverse.end()); + auto invertedRanges = + findReassociationRangesForCollapse(sourceToReverse, targetToReverse); + if (failed(invertedRanges)) + return failure(); + SmallVector &rangesToInvert = *invertedRanges; + unsigned numSourceDims = sourceShape.size(); + // We have received the ranges for inverted shapes. Now we have to invert + // the ranges back to correspond with the original source shape. + for (auto &range : rangesToInvert) { + int64_t invLeftIdx = range.leftIdx, invRightIdx = range.rightIdx; + range.leftIdx = numSourceDims - 1 - invRightIdx; + range.rightIdx = numSourceDims - 1 - invLeftIdx; + } + // Also invert the ordering of the ranges to correspond with the original + // target shape. + std::reverse(rangesToInvert.begin(), rangesToInvert.end()); + return rangesToInvert; +} + +std::optional> +mlir::getReassociationIndicesForCollapse(ArrayRef sourceShape, + ArrayRef targetShape) { + unsigned numSourceDims = sourceShape.size(), + numTargetDims = targetShape.size(); + // We're supposed to search for a collapsing reassociation. If the sizes + // match, there's no actual collapsing taking place - it's either a no-op or a + // `tensor.reshape`-style reassociation (that would be beyond the scope of + // this utility). + if (numSourceDims <= numTargetDims) + return std::nullopt; + // Early handling for scalar target types. 
+ if (numTargetDims == 0) { + ReassociationIndices allSourceIndices; + allSourceIndices.reserve(numSourceDims); + for (unsigned sourceDimIdx = 0; sourceDimIdx < numSourceDims; + ++sourceDimIdx) { + int64_t sourceSize = sourceShape[sourceDimIdx]; + // All source dimensions must be unit or dynamic. + if (sourceSize != 1 && sourceSize != ShapedType::kDynamic) + return std::nullopt; + allSourceIndices.push_back(sourceDimIdx); + } + return SmallVector{allSourceIndices}; + } + + // Collect source ranges by iterating over the target shape left-to-right. + FailureOr> maybeForwardRanges = + findReassociationRangesForCollapse(sourceShape, targetShape); + if (failed(maybeForwardRanges)) + return std::nullopt; + auto &ranges = *maybeForwardRanges; + // Now do the same in reverse. We need to get another valid reassociation + // through some other strategy, and then compare the results in order to + // disambiguate mixed subshapes, such as: + // ?x?x? into ?x?, ?x2x? into ?x?, ?x2x3x6x? into ?x6x? + // This leads us to lose some of the reassociation opportunities that can only + // be found by iterating in a certain direction, e.g. 2x2x? into 2x? - without + // backtracking, the algorithm will fail right-to-left. However, this is the + // best way to preserve correctness. + FailureOr> maybeReverseRanges = + findReassociationRangesForCollapse(sourceShape, targetShape, + /*iterateRightToLeft=*/true); + if (failed(maybeReverseRanges)) + return std::nullopt; + auto &reverseRanges = *maybeReverseRanges; + + if (ranges.size() != numTargetDims || reverseRanges.size() != numTargetDims) return std::nullopt; - // Process any remaining entries in the source shape. They all need to be - // 1 or dynamic. - for (; sourceDim < sourceShape.size(); sourceDim++) { - if (sourceShape[sourceDim] != ShapedType::kDynamic && - sourceShape[sourceDim] != 1) - return std::nullopt; - // The map is empty when the target type is a scalar. 
- if (!reassociationMap.empty()) - reassociationMap.back().push_back(sourceDim); + // Now we can check for ambiguity of each target dimension's reassociation. If + // successful, we put the full indices into our result map for the target + // shape. + SmallVector reassociationMap(numTargetDims); + for (unsigned targetDimIdx = 0; targetDimIdx < numTargetDims; + ++targetDimIdx) { + ReassociationIndexRange &range = ranges[targetDimIdx]; + ReassociationIndexRange &reverseRange = reverseRanges[targetDimIdx]; + // Get non-overlapping indices between the ranges + ReassociationIndices nonMatchingIndices = + range.getNonOverlappingIndicesWith(reverseRange); + // Unit dimensions can be collapsed wherever - this is the only ambiguity + // that we allow. + for (int64_t sourceDimIdx : nonMatchingIndices) { + if (sourceShape[sourceDimIdx] != 1) + return std::nullopt; + } + reassociationMap[targetDimIdx] = range.getFullIndices(); } return reassociationMap; } diff --git a/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir index 51350e5bc8498..6979770154bab 100644 --- a/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir +++ b/mlir/test/Dialect/Linalg/simplify-pack-unpack.mlir @@ -158,8 +158,8 @@ func.func @unpack_to_partial_slice(%arg0: tensor<8x32xf32>) -> tensor<255xf32> { // ----- // CHECK-LABEL: func.func @unpack_dynamic -// CHECK-NOT: tensor.collapse -// CHECK: linalg.unpack +// CHECK: tensor.collapse +// CHECK-NOT: linalg.unpack func.func @unpack_dynamic(%arg0: tensor) -> tensor { %c32 = arith.constant 32 : index %c0 = arith.constant 0 : index diff --git a/mlir/test/Dialect/Tensor/canonicalize.mlir b/mlir/test/Dialect/Tensor/canonicalize.mlir index 67b03b0a3485b..3251c5a4a2bfd 100644 --- a/mlir/test/Dialect/Tensor/canonicalize.mlir +++ b/mlir/test/Dialect/Tensor/canonicalize.mlir @@ -1101,7 +1101,7 @@ func.func @fold_expand_of_collapse(%arg0 : tensor<3x4x4xf32>) -> tensor<3x4x4xf3 // ----- -func.func 
@fold_expand_of_collapse_dynamic(%arg0 : tensor, %arg1: index, %arg2: index) +func.func @fold_expand_of_collapse_mixed_subshape(%arg0 : tensor, %arg1: index, %arg2: index) -> tensor { %0 = tensor.collapse_shape %arg0 [[0, 1], [2]] : tensor into tensor @@ -1109,12 +1109,28 @@ func.func @fold_expand_of_collapse_dynamic(%arg0 : tensor, %arg1: ind : tensor into tensor return %1 : tensor } -// CHECK-LABEL: @fold_expand_of_collapse_dynamic +// CHECK-LABEL: @fold_expand_of_collapse_mixed_subshape // CHECK-NOT: tensor.{{.*}}_shape // ----- -func.func @no_fold_expand_of_collapse_dynamic(%arg0 : tensor, %arg1: index, %arg2: index, %arg3: index) +func.func @fold_expand_of_collapse_mixed_target_subshape(%arg0 : tensor, %arg1: index, %arg2: index) + -> tensor { + %0 = tensor.collapse_shape %arg0 [[0, 1], [2, 3]] + : tensor into tensor + %1 = tensor.expand_shape %0 [[0, 1], [2]] output_shape [%arg1, 4, %arg2] + : tensor into tensor + return %1 : tensor +} +// CHECK-LABEL: @fold_expand_of_collapse_mixed_target_subshape +// CHECK-NOT: tensor.expand_shape +// CHECK: %[[COLLAPSE:.+]] = tensor.collapse_shape %arg0 {{\[}}[0], [1], [2, 3]] +// CHECK-SAME: : tensor into tensor +// CHECK-NEXT: return %[[COLLAPSE]] + +// ----- + +func.func @no_fold_expand_of_collapse_fully_dynamic(%arg0 : tensor, %arg1: index, %arg2: index, %arg3: index) -> tensor { %0 = tensor.collapse_shape %arg0 [[0, 1], [2]] : tensor into tensor @@ -1122,7 +1138,22 @@ func.func @no_fold_expand_of_collapse_dynamic(%arg0 : tensor, %arg1: : tensor into tensor return %1 : tensor } -// CHECK-LABEL: @no_fold_expand_of_collapse_dynamic +// CHECK-LABEL: @no_fold_expand_of_collapse_fully_dynamic +// CHECK: tensor.collapse_shape +// CHECK: %[[EXPAND:.+]] = tensor.expand_shape +// CHECK: return %[[EXPAND]] + +// ----- + +func.func @no_fold_expand_of_collapse_adjacent_dynamic(%arg0 : tensor, %arg1: index, %arg2: index) + -> tensor { + %0 = tensor.collapse_shape %arg0 [[0, 1, 2]] + : tensor into tensor + %1 = tensor.expand_shape 
%0 [[0, 1]] output_shape [%arg1, %arg2] + : tensor into tensor + return %1 : tensor +} +// CHECK-LABEL: @no_fold_expand_of_collapse_adjacent_dynamic // CHECK: tensor.collapse_shape // CHECK: %[[EXPAND:.+]] = tensor.expand_shape // CHECK: return %[[EXPAND]] diff --git a/mlir/unittests/Dialect/Utils/CMakeLists.txt b/mlir/unittests/Dialect/Utils/CMakeLists.txt index 61b9cdcb3b8f3..e921c8bcfb4e5 100644 --- a/mlir/unittests/Dialect/Utils/CMakeLists.txt +++ b/mlir/unittests/Dialect/Utils/CMakeLists.txt @@ -1,5 +1,6 @@ add_mlir_unittest(MLIRDialectUtilsTests StructuredOpsUtilsTest.cpp + ReshapeOpsUtilsTest.cpp IndexingUtilsTest.cpp ) mlir_target_link_libraries(MLIRDialectUtilsTests diff --git a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp new file mode 100644 index 0000000000000..db1a87a4de2d5 --- /dev/null +++ b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp @@ -0,0 +1,203 @@ +//===- ReshapeOpsUtilsTest.cpp - ReshapeOpsUtils unit tests ---------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Dialect/Utils/ReshapeOpsUtils.h" +#include "mlir/IR/BuiltinTypeInterfaces.h" +#include "llvm/ADT/STLExtras.h" +#include "gtest/gtest.h" +#include + +using namespace mlir; + +/// Helper to make constructing +/// `std::optional>` more readable. 
+static std::optional> +makeOptionalIndices(std::initializer_list list) { + return std::optional>(list); +} + +TEST(ReassociationIndicesForCollapse, ScalarTest) { + EXPECT_EQ(getReassociationIndicesForCollapse({1}, {}), + makeOptionalIndices({{0}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, 1}, {}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic}, {}), + makeOptionalIndices({{0}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, + ShapedType::kDynamic, 1, + ShapedType::kDynamic}, + {}), + makeOptionalIndices({{0, 1, 2, 3, 4}})); +} + +TEST(ReassociationIndicesForCollapse, ScalarTestFailure) { + EXPECT_EQ(getReassociationIndicesForCollapse({}, {}), std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({}, {1}), std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({2}, {}), std::nullopt); + EXPECT_EQ( + getReassociationIndicesForCollapse({1, 2, ShapedType::kDynamic, 1}, {}), + std::nullopt); +} + +TEST(ReassociationIndicesForCollapse, StaticTest) { + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {200}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {10, 600}), + makeOptionalIndices({{0}, {1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {200, 30}), + makeOptionalIndices({{0, 1}, {2}})); +} + +TEST(ReassociationIndicesForCollapse, StaticTestFailure) { + // No-op reassociation + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {10, 20}), + std::nullopt); + // Invalid static reassociations + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20}, {10}), std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {200, 300}), + std::nullopt); + // Non-collapsing (expanding) reassociation + EXPECT_EQ(getReassociationIndicesForCollapse({10, 20, 30}, {1, 10, 20, 30}), + std::nullopt); +} + +TEST(ReassociationIndicesForCollapse, StaticTestUnitDims) { + 
EXPECT_EQ(getReassociationIndicesForCollapse({10, 1}, {10}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, 20, 30}, {600}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, 1, 1}, {1}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, 1, 1, 1}, {1, 1, 1}), + makeOptionalIndices({{0}, {1}, {2, 3}})); +} + +TEST(ReassociationIndicesForCollapse, DynamicTest) { + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 1}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 1, 1}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {1, ShapedType::kDynamic, 1, ShapedType::kDynamic, 1}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}, {2, 3, 4}})); + EXPECT_EQ( + getReassociationIndicesForCollapse( + {ShapedType::kDynamic, ShapedType::kDynamic}, {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {1, ShapedType::kDynamic, ShapedType::kDynamic}, + {1, ShapedType::kDynamic}), + makeOptionalIndices({{0}, {1, 2}})); + + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {1, ShapedType::kDynamic, ShapedType::kDynamic}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({10, ShapedType::kDynamic}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 1, 2, ShapedType::kDynamic, 10}, + {ShapedType::kDynamic, 10}), + makeOptionalIndices({{0, 1, 2, 3}, {4}})); + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10, 20}, + 
{ShapedType::kDynamic, 20}), + makeOptionalIndices({{0, 1}, {2}})); + EXPECT_EQ(getReassociationIndicesForCollapse({10, ShapedType::kDynamic, 20}, + {ShapedType::kDynamic, 20}), + makeOptionalIndices({{0, 1}, {2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 3, 2, 5, 2}, {ShapedType::kDynamic, 20}), + makeOptionalIndices({{0, 1}, {2, 3, 4}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {10, ShapedType::kDynamic, 20, ShapedType::kDynamic, 1}, + {ShapedType::kDynamic, 20, ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}, {2}, {3, 4}})); + EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, 1}, + {ShapedType::kDynamic}), + makeOptionalIndices({{0, 1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, ShapedType::kDynamic, 1}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + makeOptionalIndices({{0}, {1, 2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {1, ShapedType::kDynamic, ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + makeOptionalIndices({{0, 1}, {2}})); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 1, ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + makeOptionalIndices({{0}, {1, 2}})); +} + +TEST(ReassociationIndicesForCollapse, DynamicTestFailure) { + EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic, 10, 20}, + {ShapedType::kDynamic, 10}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 10, ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {20, ShapedType::kDynamic, 10, ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 5, 3, 2, 2}, {ShapedType::kDynamic, 20}), + std::nullopt); + EXPECT_EQ( + getReassociationIndicesForCollapse( + 
{ShapedType::kDynamic, ShapedType::kDynamic, ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, ShapedType::kDynamic, 10, 1, + ShapedType::kDynamic}, + {ShapedType::kDynamic, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 10, 10, 10, ShapedType::kDynamic}, + {ShapedType::kDynamic, 10, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 10, 10, 10, ShapedType::kDynamic}, + {ShapedType::kDynamic, 2, 2, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 3, 4, 3, ShapedType::kDynamic}, + {ShapedType::kDynamic, 12, ShapedType::kDynamic}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 8, 4, 2, 16, ShapedType::kDynamic}, + {ShapedType::kDynamic, 32, ShapedType::kDynamic}), + std::nullopt); + + //===----------------------------------------------------------------------===// + // TODO: Reassociation for the following examples can be computed, but isn't + // supported by `getReassociationIndicesForCollapse`. + //===----------------------------------------------------------------------===// + + // TODO: Fails because there's no backtracking when some source dimensions + // remain unmatched at either edge. 
+ EXPECT_EQ(getReassociationIndicesForCollapse( + {ShapedType::kDynamic, 10, ShapedType::kDynamic, 10}, + {ShapedType::kDynamic, 10}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, 2, 2}, + {1, ShapedType::kDynamic, 2}), + std::nullopt); + EXPECT_EQ(getReassociationIndicesForCollapse({2, 2, ShapedType::kDynamic, 1}, + {2, ShapedType::kDynamic}), + std::nullopt); +} From edaac11df3f82268e8ca34bf34b3e9d115b7d475 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 12 Jun 2025 09:29:41 +0100 Subject: [PATCH 196/851] [X86] combineSelect - attempt to combine with shuffles (#143753) Before legalization we will convert to a vector_shuffle node - but afterward we can try to combine the select into an existing target shuffle chain --- llvm/lib/Target/X86/X86ISelLowering.cpp | 16 +- .../CodeGen/X86/combine-mask-with-shuffle.ll | 32 +- llvm/test/CodeGen/X86/pr132844.ll | 11 +- .../vector-interleaved-load-i8-stride-7.ll | 1166 ++++--- .../vector-interleaved-store-i16-stride-8.ll | 2864 ++++++++--------- .../vector-interleaved-store-i8-stride-5.ll | 30 +- .../vector-interleaved-store-i8-stride-6.ll | 2026 ++++++------ .../vector-interleaved-store-i8-stride-7.ll | 231 +- .../vector-interleaved-store-i8-stride-8.ll | 1096 +++---- .../X86/vector-shuffle-combining-avx512f.ll | 40 +- 10 files changed, 3610 insertions(+), 3902 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 96714adf78e43..b0553aa4b8197 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -47785,13 +47785,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG, DL, DAG, Subtarget)) return V; - // Convert vselects with constant condition into shuffles. 
- if (CondConstantVector && DCI.isBeforeLegalizeOps() && - (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV)) { + if (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::BLENDV) { SmallVector Mask; if (createShuffleMaskFromVSELECT(Mask, Cond, - N->getOpcode() == X86ISD::BLENDV)) - return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); + N->getOpcode() == X86ISD::BLENDV)) { + // Convert vselects with constant condition into shuffles. + if (DCI.isBeforeLegalizeOps()) + return DAG.getVectorShuffle(VT, DL, LHS, RHS, Mask); + + // Attempt to combine as shuffle. + SDValue Op(N, 0); + if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget)) + return Res; + } } // fold vselect(cond, pshufb(x), pshufb(y)) -> or (pshufb(x), pshufb(y)) diff --git a/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll index 268ac3dd31b85..7564e65a428b7 100644 --- a/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll +++ b/llvm/test/CodeGen/X86/combine-mask-with-shuffle.ll @@ -67,11 +67,9 @@ define <16 x i32> @combine_mask_with_abs(<16 x i32> %v0) { define <16 x i32> @combine_mask_with_umin(<16 x i32> %v0) { ; CHECK-LABEL: combine_mask_with_umin: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpminud %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-NEXT: vpminud %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: movw $-3856, %ax # imm = 0xF0F0 ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpopcntd %zmm0, %zmm1 {%k1} @@ -88,11 +86,9 @@ define <16 x i32> @combine_mask_with_umin(<16 x i32> %v0) { define <16 x i32> @combine_mask_with_umax(<16 x i32> %v0) { ; CHECK-LABEL: 
combine_mask_with_umax: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpmaxud %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-NEXT: vpmaxud %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: movw $-3856, %ax # imm = 0xF0F0 ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpopcntd %zmm0, %zmm1 {%k1} @@ -109,11 +105,9 @@ define <16 x i32> @combine_mask_with_umax(<16 x i32> %v0) { define <16 x i32> @combine_mask_with_smin(<16 x i32> %v0) { ; CHECK-LABEL: combine_mask_with_smin: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpminsd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-NEXT: vpminsd %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: movw $-3856, %ax # imm = 0xF0F0 ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpopcntd %zmm0, %zmm1 {%k1} @@ -130,11 +124,9 @@ define <16 x i32> @combine_mask_with_smin(<16 x i32> %v0) { define <16 x i32> @combine_mask_with_smax(<16 x i32> %v0) { ; CHECK-LABEL: combine_mask_with_smax: ; CHECK: # %bb.0: -; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] -; CHECK-NEXT: movw $-21846, %ax # imm = 0xAAAA -; CHECK-NEXT: kmovw %eax, %k1 -; CHECK-NEXT: vpblendmd %zmm1, %zmm0, %zmm2 {%k1} -; CHECK-NEXT: vpmaxsd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpshufd {{.*#+}} zmm1 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] +; CHECK-NEXT: vpshufd {{.*#+}} zmm2 = 
zmm0[1,0,3,2,5,4,7,6,9,8,11,10,13,12,15,14] +; CHECK-NEXT: vpmaxsd %zmm1, %zmm2, %zmm1 ; CHECK-NEXT: movw $-3856, %ax # imm = 0xF0F0 ; CHECK-NEXT: kmovw %eax, %k1 ; CHECK-NEXT: vpopcntd %zmm0, %zmm1 {%k1} diff --git a/llvm/test/CodeGen/X86/pr132844.ll b/llvm/test/CodeGen/X86/pr132844.ll index ded100b2accce..dc9f006d93d12 100644 --- a/llvm/test/CodeGen/X86/pr132844.ll +++ b/llvm/test/CodeGen/X86/pr132844.ll @@ -4,12 +4,11 @@ define { ptr, i8 } @PR132844(<4 x ptr> %0, <4 x ptr> %1) { ; CHECK-LABEL: PR132844: ; CHECK: # %bb.0: -; CHECK-NEXT: vextracti128 $1, %ymm0, %xmm2 -; CHECK-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; CHECK-NEXT: movb $10, %al -; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vinserti64x2 $1, 16, %ymm2, %ymm0 {%k1} -; CHECK-NEXT: vmovdqu %ymm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 +; CHECK-NEXT: vinsertf128 $1, 16, %ymm2, %ymm2 +; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; CHECK-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] +; CHECK-NEXT: vmovups %ymm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index c132c5ea2ef49..82481269022b0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -13723,364 +13723,361 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FCP-LABEL: load_i8_stride7_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 +; AVX512BW-FCP-NEXT: 
vpermw %zmm3, %zmm1, %zmm16 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm17 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm12 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm18 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm7 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm8 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm6 +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm7 ; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm1 {%k1} ; AVX512BW-FCP-NEXT: kmovq %k1, %k2 ; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = 
zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: kmovq %k1, %k3 -; AVX512BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512BW-FCP-NEXT: 
vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512BW-FCP-NEXT: kmovd %eax, %k6 +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm9 {%k6} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,2,4,6] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm14 +; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm10, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] ; AVX512BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %xmm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm9 ; AVX512BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} -; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm1 {%k5} +; AVX512BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 
; AVX512BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm8 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm8, %xmm21 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm11 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm11[u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm11, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512BW-FCP-NEXT: kmovd %eax, %k7 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} -; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm21 {%k7} +; AVX512BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm11 +; AVX512BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm7 ; AVX512BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm18 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm22 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512BW-FCP-NEXT: vpshufb 
{{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm15[4,11],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm0, %ymm22 ; AVX512BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm15 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm23 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[1,8,15],zero,zero,xmm15[4,11],zero,zero,xmm15[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm7 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 -; AVX512BW-FCP-NEXT: 
vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] -; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm8 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm15 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,6,13],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u],zero,zero,xmm15[4,11],zero,zero,xmm15[0,7,14,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm23, %xmm15, %xmm15 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm23 = [0,0,0,0,1,3,4,6] +; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm23, %ymm23 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm23[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0,1,2,3,4,5,6],ymm13[7] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k5} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq 
%xmm15, %xmm23, %xmm15 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm8 {%k5} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm13 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k2} ; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] ; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] -; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, 
%zmm12, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpermd %ymm14, %ymm15, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm12, %zmm12 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm13 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm14, %xmm13, %xmm23 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm23 {%k5} = ymm24[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 
{%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[2,9,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 +; AVX512BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 -; AVX512BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] -; AVX512BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm14 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm24 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[5,12] +; AVX512BW-FCP-NEXT: vmovdqa 192(%rdi), %xmm15 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm25 = xmm15[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm24, %xmm25, %xmm24 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm0, %ymm24 ; AVX512BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm24, %ymm13 {%k2} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 
{%k1} -; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm14 +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19 +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm23, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm0, %zmm19 ; AVX512BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k4} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} +; AVX512BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm1 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm19 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm20 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u,u,u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm19 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm18 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = 
xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] +; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm19 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm18, %zmm8 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm18 {%k6} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm18 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm17 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm18 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm17 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm17, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm17 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,6,13],zero,zero,xmm17[2,9],zero,zero,zero,xmm17[u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = 
xmm17[u,u,u,u,u],zero,zero,xmm17[4,11],zero,zero,xmm17[0,7,14,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm17 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm16 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,4,11],zero,zero,xmm16[0,7,14],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm18, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm17 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm0, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm16, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm16, %zmm16 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm17, %zmm17 ; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] -; AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm15 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero -; 
AVX512BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 -; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm14 {%k2} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm18 {%k1} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} -; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k4} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,0,7,14],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm18 {%k7} +; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm17 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm19 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = 
xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,1,8,15],zero,zero,xmm17[4,11],zero,zero,xmm17[u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm19 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} -; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k1} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k6} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] -; AVX512BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} -; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} -; AVX512BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} -; 
AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} -; AVX512BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k3 # 4-byte Reload -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k3} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm19 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm16, %zmm20 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm16, %zmm17 +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm2, %ymm16 {%k4} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[4,11],zero,zero,xmm16[0,7,14],zero,zero,xmm16[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm16, %xmm16 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 {%k5} = 
ymm17[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm17 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,2,9],zero,zero,zero,xmm17[5,12],zero,zero,xmm17[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u],zero,zero,xmm17[0,7,14],zero,zero,xmm17[3,10,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm17, %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm14[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm17, %xmm22, %xmm17 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm21 {%k7} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,2,4,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm17 +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm22, %ymm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm22, %zmm21, %zmm21 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm16 {%k5} +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm21 {%k6} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm22 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,xmm22[3,10],zero,zero,zero,xmm22[6,13] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u,5,12],zero,zero,xmm21[1,8,15],zero,zero +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm21, %xmm21 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm18 {%k2} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm18, %zmm0, %zmm18 +; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512BW-FCP-NEXT: vpblendmw %ymm7, 
%ymm11, %ymm21 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm21[u,u,u,u,u,u,u,6,13],zero,zero,xmm21[2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[4,11],zero,zero,xmm21[0,7,14] +; AVX512BW-FCP-NEXT: vporq %xmm22, %xmm21, %xmm21 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm19 {%k2} +; AVX512BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vpblendmw %ymm4, %ymm2, %ymm18 {%k6} +; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[5,12],zero,zero,xmm18[1,8,15],zero,zero,xmm18[u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm20 {%k3} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] ; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 +; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm20, %xmm20 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k3} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 -; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm20 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] -; AVX512BW-FCP-NEXT: vporq %xmm9, %xmm17, %xmm9 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm20 {%k3} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm17 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm18 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm9 -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,2,4,6,0,0,0,0] -; AVX512BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm19 -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} -; AVX512BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm9 -; 
AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm9 {%k5} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,5,6,0,0,0,0] -; AVX512BW-FCP-NEXT: vpermd %ymm19, %ymm10, %ymm10 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm20, %zmm10 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm3 {%k5} -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm6 {%k2} -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm8 -; AVX512BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000 -; AVX512BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm15[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[0,7,14] +; AVX512BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm20 {%k7} +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [1,3,4,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm21, %ymm21 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm21, %zmm20, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm18 {%k5} +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm0, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm19, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm7 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm11, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm9 {%k6} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u] +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: 
vinserti64x4 $1, %ymm15, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm9 {%k1} -; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero -; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm2 {%k3} +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k4} +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm15[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm14[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm14[1,8,15] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm3 {%k7} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] 
+; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = [1,3,5,6,0,0,0,0] +; AVX512BW-FCP-NEXT: vpermd %ymm17, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k5} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero +; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm4 +; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,17,18,19,20,21,22,23] +; AVX512BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdi) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, (%r9) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rdi) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper 
; AVX512BW-FCP-NEXT: retq ; @@ -14453,362 +14450,359 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512DQ-BW-FCP-LABEL: load_i8_stride7_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 64(%rdi), %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 320(%rdi), %zmm3 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,19,28,21,6,31,16,9,26,27,20,13,30,23] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm24 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm24 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [8,1,18,11,4,5,22,15,0,25,10,3,12,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] ; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [24,17,2,27,20,5,22,31,16,9,26,19,12,29,30,23] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,18,11,4,21,14,7,8,25,10,3,28,13,6,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm12 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,2,11,4,5,14,7,8,9,26,19,12,29,22,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm18 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,17,10,3,4,21,14,7,24,9,2,11,28,13,6,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm1, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm8 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [0,1,10,3,4,5,14,7,8,25,18,11,12,29,22,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm1, %zmm7 ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm1 = [16,17,10,3,20,13,6,23,24,25,18,27,28,21,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, 
%zmm1, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm1, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: movw $-28382, %ax # imm = 0x9122 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k2 ; AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[5,12],zero,zero,xmm5[1,8,15,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm1, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[5,12],zero,zero,xmm6[1,8,15,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,7,14],zero,zero,xmm1[3,10],zero,zero,zero,xmm1[u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm1, %xmm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm6, %xmm1, %xmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $992, %ax # imm = 0x3E0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 128(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 160(%rdi), %ymm5 ; AVX512DQ-BW-FCP-NEXT: movw $8772, %ax # imm = 0x2244 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovq %k1, %k3 -; 
AVX512DQ-BW-FCP-NEXT: kmovw %k1, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u],zero,zero,xmm5[3,10],zero,zero,zero,xmm5[6,13,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,5,12],zero,zero,xmm4[1,8,15],zero,zero,xmm4[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm5 = [0,0,0,0,1,2,4,6] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm5, %ymm5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3,4,5,6],ymm5[7] +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm9 {%k6} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u,u],zero,zero,xmm10[3,10],zero,zero,zero,xmm10[6,13,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,5,12],zero,zero,xmm9[1,8,15],zero,zero,xmm9[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,1,2,4,6] +; AVX512DQ-BW-FCP-NEXT: vmovdqa 192(%rdi), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm10, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,23,26,29] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5,6],ymm10[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 240(%rdi), %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm19[5,12,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: 
vmovdqa64 224(%rdi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm8, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm5, %zmm4, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[0,7,14],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm11, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm9, %zmm9 ; AVX512DQ-BW-FCP-NEXT: movabsq $137438429184, %rax # imm = 0x1FFFF80000 ; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm4, %zmm1 {%k5} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm9, %zmm1 {%k5} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 288(%rdi), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 256(%rdi), %ymm9 ; AVX512DQ-BW-FCP-NEXT: movw $9288, %ax # imm = 0x2448 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k6 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm8 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm8[u,u,u,u,u,3,10],zero,zero,zero,xmm8[6,13],zero,zero,xmm8[u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm8, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u],zero,zero,xmm8[1,8,15],zero,zero,xmm8[4,11,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm8, %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm11 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm11[u,u,u,u,u,3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm11, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,u,u,u],zero,zero,xmm11[1,8,15],zero,zero,xmm11[4,11,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm11, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} 
ymm7 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,2,9,16,23,30,21,28,19,26,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movw $3968, %ax # imm = 0xF80 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k7 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm21 {%k7} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm8 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm7, %ymm21 {%k7} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 416(%rdi), %ymm11 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 384(%rdi), %ymm7 ; AVX512DQ-BW-FCP-NEXT: movw $4644, %ax # imm = 0x1224 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k4 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm18 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm16 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm22 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[u,u,u,u,u,u,u],zero,zero,zero,xmm22[6,13],zero,zero,xmm22[2,9] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm22 ; AVX512DQ-BW-FCP-NEXT: movl $-8388608, %eax # imm = 0xFF800000 -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm18 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm16 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm23 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = zero,zero,zero,xmm23[6,13],zero,zero,xmm23[2,9,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $511, %r10d # imm = 0x1FF ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm7 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm9, %ymm18 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,6,13],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u],zero,zero,xmm18[4,11],zero,zero,xmm18[0,7,14,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm18, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm14 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm18 = [0,0,0,0,1,3,4,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm18, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm18, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm14, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[2,9],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,xmm14[0,7,14],zero,zero,xmm14[3,10,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm5, %ymm16 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm16[u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm16, %xmm16 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm16 = [0,0,0,0,1,3,4,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,17,20,27,30] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm13 = ymm14[0,1,2,3,4,5,6],ymm13[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = zero,zero,zero,xmm19[6,13,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm20[1,8,15],zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm14, %xmm16, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm13, %zmm8 {%k5} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm13 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[2,9],zero,zero,zero,xmm13[5,12],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[0,7,14],zero,zero,xmm13[3,10,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX512DQ-BW-FCP-NEXT: movl $261632, %r10d # imm = 0x3FE00 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k5 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = 
ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[5,12],zero,zero,xmm15[1,8,15,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm13 {%k5} = ymm12[u,u,u,u,u,u,u,u,u,1,8,15,6,13,4,11,18,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm12 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm12, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u],zero,zero,zero,xmm14[5,12],zero,zero,xmm14[1,8,15,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[u,u,0,7,14],zero,zero,xmm12[3,10],zero,zero,zero,xmm12[u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm12, %xmm12 +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm12, %xmm12 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm15 = [0,0,0,0,1,3,5,6] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm17, %ymm15, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] -; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm15[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm17, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm12, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm3, %ymm10, %ymm14 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[3,10],zero,zero,zero,xmm14[6,13],zero,zero,xmm14[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = 
zero,zero,xmm14[1,8,15],zero,zero,xmm14[4,11,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm13[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm13 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm14 = [0,0,0,0,1,3,5,6] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm15, %ymm14, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,21,24,31] +; AVX512DQ-BW-FCP-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,6],ymm14[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm20[2,9],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = zero,zero,xmm19[0,7,14,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm15, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm14, %zmm12, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm2, %ymm4, %ymm13 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm13[3,10],zero,zero,zero,xmm13[6,13],zero,zero,xmm13[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = zero,zero,xmm13[1,8,15],zero,zero,xmm13[4,11,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm14, %xmm13, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm25[u,u,u,u,u,u,u,u,u,2,9,0,7,14,5,12,19,26,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm13 {%k4} ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm13, %xmm15 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u],zero,zero,zero,xmm15[6,13],zero,zero,xmm15[2,9,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[u,u,1,8,15],zero,zero,xmm13[4,11],zero,zero,xmm13[u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm13, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, 
%ymm0, %ymm13 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 208(%rdi), %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[5,12] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm18[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm23, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 208(%rdi), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm23 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[5,12] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 192(%rdi), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm25 = xmm16[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm23, %xmm25, %xmm23 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm0, %ymm23 ; AVX512DQ-BW-FCP-NEXT: movl $-134217728, %r10d # imm = 0xF8000000 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 ; AVX512DQ-BW-FCP-NEXT: kmovd %k2, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm13 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm23, %ymm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[3,10],zero,zero,zero,xmm20[u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = zero,zero,xmm19[1,8,15,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm15, %xmm19, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm15, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm19, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm19, %zmm13, %zmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm13 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k3} +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm22, %ymm21 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, 
%zmm0, %zmm14 ; AVX512DQ-BW-FCP-NEXT: movabsq $-137438953472, %rax # imm = 0xFFFFFFE000000000 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[2,9],zero,zero,zero,xmm15[5,12,u,u] +; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm1 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u],zero,zero,xmm19[2,9],zero,zero,zero,xmm19[5,12,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm15[5,12],zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14],zero,zero,xmm15[3,10] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,3,10,17,24,31,22,29,20,27,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, 
%ymm7, %ymm18 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm18[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm7 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u],zero,zero,xmm15[3,10],zero,zero,zero,xmm15[6,13,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u],zero,zero,xmm18[3,10],zero,zero,zero,xmm18[6,13,u,u] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero,xmm14[u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm25[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm8, %ymm6, %ymm15 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm15[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm15[6,13],zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15],zero,zero,xmm15[4,11] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 
{%k3} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm17[u,u,u,u,u,u,u,u,u,u,u,u,u,u,4,11,18,25,16,23,30,21,28,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm11, %ymm7, %ymm17 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm17[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm17[6,13],zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15],zero,zero,xmm17[4,11] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm5, %ymm14 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm10, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm14[u,u,u,u,u,6,13],zero,zero,xmm14[2,9],zero,zero,zero,xmm14[u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u],zero,zero,xmm14[4,11],zero,zero,xmm14[0,7,14,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm15, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm15, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm15 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[2,9],zero,zero,zero,xmm16[5,12] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} 
xmm15 = xmm15[u,u,u,u,u,u,u,4,11],zero,zero,xmm15[0,7,14],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm16, %xmm15, %xmm15 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm15, %ymm14 {%k3} +; AVX512DQ-BW-FCP-NEXT: vporq %xmm17, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 = ymm24[u,u,u,u,u,u,u,u,u,u,u,u,u,u,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm17, %ymm14 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm17 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,u,u,u],zero,zero,xmm18[2,9],zero,zero,zero,xmm18[5,12] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,4,11],zero,zero,xmm17[0,7,14],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm0, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm17, %ymm14 {%k2} ; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm13 {%k1} ; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,9,2,3,4,13,6,7,24,17,10,11,28,21,14,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm17 = [16,25,18,3,28,21,6,23,24,17,10,27,20,13,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm17, %zmm18 ; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm17, 
%xmm19 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[5,12],zero,zero,xmm19[1,8,15,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,0,7,14],zero,zero,xmm14[3,10],zero,zero,zero,xmm14[u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm14, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm16 = ymm16[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm16, %ymm14 {%k7} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm4, %ymm16 {%k4} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,0,7,14],zero,zero,xmm17[3,10],zero,zero,zero,xmm17[u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm18 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm18, %ymm17 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm9, %ymm18 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm19 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u],zero,zero,zero,xmm19[6,13],zero,zero,xmm19[2,9,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,1,8,15],zero,zero,xmm16[4,11],zero,zero,xmm16[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u,u,u,1,8,15],zero,zero,xmm18[4,11],zero,zero,xmm18[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm19 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: movl $8176, %eax # imm = 0x1FF0 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: kmovw 
{{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[u,u,u,u,u,u,u],zero,zero,xmm19[3,10],zero,zero,zero,xmm19[6,13] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u,5,12],zero,zero,xmm16[1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm14 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm6, %ymm8, %ymm16 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm16[u,u,u,u,u,u,u,6,13],zero,zero,xmm16[2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm16 = xmm16[u,u,u,u,u,u,u],zero,zero,xmm16[4,11],zero,zero,xmm16[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm19, %xmm16, %xmm16 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm16, %ymm15 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm19 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm10, %ymm3, %ymm16 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm20 {%k6} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm3 {%k6} -; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm9, %ymm11, %ymm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm9 {%k4} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,2,9],zero,zero,zero,xmm10[5,12],zero,zero,xmm10[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[u,u],zero,zero,xmm10[0,7,14],zero,zero,xmm10[3,10,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm11, %xmm10, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm17[6,13] -; AVX512DQ-BW-FCP-NEXT: 
vpshufb {{.*#+}} xmm21 = xmm18[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero -; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm21, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 4-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm11, %ymm10 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm11 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm14, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm14 {%k6} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[u,u,u,u,u,u,u,5,12],zero,zero,xmm14[1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm14, %ymm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm2, %ymm14 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm14, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[2,9],zero,zero,zero,xmm21[5,12,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm14 = xmm14[4,11],zero,zero,xmm14[0,7,14],zero,zero,xmm14[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm14, %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm14 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm18 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = 
xmm18[u,u,2,9],zero,zero,zero,xmm18[5,12],zero,zero,xmm18[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm18[u,u],zero,zero,xmm18[0,7,14],zero,zero,xmm18[3,10,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm18, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm18 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm15[6,13] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm16[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero +; AVX512DQ-BW-FCP-NEXT: vporq %xmm18, %xmm22, %xmm18 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 +; AVX512DQ-BW-FCP-NEXT: kmovd {{[-0-9]+}}(%r{{[sb]}}p), %k7 # 4-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm18, %ymm21 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm22 = [1,2,4,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm22, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm22[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm22, %zmm21, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm21, %zmm14 {%k5} +; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm7, %ymm11, %ymm21 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm21[u,u,u,u,u,u,u,6,13],zero,zero,xmm21[2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm21, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[u,u,u,u,u,u,u],zero,zero,xmm21[4,11],zero,zero,xmm21[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm22, %xmm21, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm19 {%k2} +; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm17, %zmm0, %zmm14 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm4, %ymm2, %ymm17 {%k6} +; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, 
%ymm17, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = zero,zero,xmm21[3,10],zero,zero,zero,xmm21[6,13,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[5,12],zero,zero,xmm17[1,8,15],zero,zero,xmm17[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm17, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm17 {%k5} = ymm20[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpblendmw %ymm5, %ymm6, %ymm20 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm20[u,u,3,10],zero,zero,zero,xmm20[6,13],zero,zero,xmm20[u,u,u,u,u] ; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm20, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[1,8,15],zero,zero,xmm20[4,11,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm11, %xmm20, %xmm11 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm18[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[0,7,14] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm21, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm20, %xmm20 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm0, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm20, %ymm11 {%k2} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm9, %xmm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm20 = xmm20[u,u],zero,zero,xmm20[2,9],zero,zero,zero,xmm20[5,12,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,4,11],zero,zero,xmm9[0,7,14],zero,zero,xmm9[u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm20, %xmm9, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm9, %ymm0, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm18[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm17 = xmm17[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm17[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm9, %xmm17, %xmm9 -; 
AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm9, %ymm0, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm9, %ymm20 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [16,9,2,19,12,5,22,23,24,17,26,27,20,29,30,31] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [8,1,2,19,12,5,22,15,0,9,26,11,4,29,14,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm2, %zmm9, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm19, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[2,9],zero,zero,zero,xmm2[5,12,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm19 = xmm19[4,11],zero,zero,xmm19[0,7,14],zero,zero,xmm19[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vporq %xmm2, %xmm19, %xmm2 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm9[u,u,u,u,u,u,u,u,u,3,10,1,8,15,6,13,20,27,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [1,2,4,6,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 224(%rdi), %ymm19 -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm9, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[0,7,10,13,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm9, %zmm10, %zmm9 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm2 {%k5} -; AVX512DQ-BW-FCP-NEXT: vextracti32x4 $1, %ymm16, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm9[3,10],zero,zero,zero,xmm9[6,13,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm16[5,12],zero,zero,xmm16[1,8,15],zero,zero,xmm16[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k5} = ymm18[u,u,u,u,u,u,u,u,u,4,11,2,9,0,7,14,21,28,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,4,6,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm10, %ymm10 -; 
AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm11, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm9 {%k5} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[6,13],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm3, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm3[4,11],zero,zero,xmm3[0,7,14,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm10, %xmm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 {%k5} = ymm17[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [1,3,5,6,0,0,0,0] -; AVX512DQ-BW-FCP-NEXT: vpermd %ymm19, %ymm10, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm10, %zmm20, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm10, %zmm3 {%k5} -; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k2 # 2-byte Reload -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm8, %ymm6 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: movw $-512, %ax # imm = 0xFE00 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm5, %ymm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,2,9],zero,zero,zero,xmm4[5,12],zero,zero,xmm4[u,u,u] -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u,u],zero,zero,xmm4[0,7,14],zero,zero,xmm4[3,10,u,u,u] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm4, %xmm4 -; AVX512DQ-BW-FCP-NEXT: movl $4186112, %edi # imm = 0x3FE000 -; AVX512DQ-BW-FCP-NEXT: kmovd %edi, %k1 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 {%k1} = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} 
xmm21 = xmm16[u,u,u,u,u,u,u,u,u,u,u,2,9],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm22 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[0,7,14] +; AVX512DQ-BW-FCP-NEXT: vporq %xmm21, %xmm22, %xmm21 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm0, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm21, %ymm20 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm21 = [1,3,4,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm21, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm21 = xmm21[1,4,11,14,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm21, %zmm20, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm17 {%k5} +; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm19, %zmm0, %zmm17 {%k1} +; AVX512DQ-BW-FCP-NEXT: kmovw {{[-0-9]+}}(%r{{[sb]}}p), %k1 # 2-byte Reload +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm11, %ymm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,1,10,3,4,13,6,7,8,25,18,11,28,21,14,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm11, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm9 {%k6} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[u,u,u,u,2,9],zero,zero,zero,xmm9[5,12],zero,zero,xmm9[u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u],zero,zero,xmm9[0,7,14],zero,zero,xmm9[3,10,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm10, %xmm9, %xmm9 +; AVX512DQ-BW-FCP-NEXT: movl $4186112, %eax # imm = 0x3FE000 ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm14, %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm15, %zmm0, %zmm9 {%k1} -; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm6, %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm6[u,u,u,u,u,u,0,7,14],zero,zero,xmm6[3,10],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm5, %xmm0 -; 
AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm4, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti32x8 $1, %ymm5, %zmm0, %zmm3 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 {%k1} = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,1,8,15,22,29,20,27,18,25,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm4, %ymm2 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm5 {%k4} +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm5, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[2,9],zero,zero,zero,xmm3[5,12,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm5[u,u,4,11],zero,zero,xmm5[0,7,14],zero,zero,xmm5[u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm3, %xmm4, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm16[u,u,u,u,u,u,u,u,u,u,u,3,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm15[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm15[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm5, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm4, %ymm3 {%k7} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [16,9,2,19,20,13,6,23,24,17,26,27,28,21,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm2[6,13],zero,zero,xmm2[2,9],zero,zero,zero,xmm2[u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm2[4,11],zero,zero,xmm2[0,7,14,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpor %xmm4, %xmm2, %xmm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 {%k5} = ymm0[u,u,u,u,u,u,u,u,u,5,12,3,10,1,8,15,22,29,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm0 = 
[1,3,5,6,0,0,0,0] +; AVX512DQ-BW-FCP-NEXT: vpermd %ymm18, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,15,u,u,u,u,u,u,u,u,u,u,u,u] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $2, %xmm0, %zmm3, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k5} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rdi +; AVX512DQ-BW-FCP-NEXT: vextracti128 $1, %ymm7, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,u,u,u,u],zero,zero,zero,xmm0[5,12],zero,zero,xmm0[1,8,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,0,7,14],zero,zero,xmm7[3,10],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm3, %xmm0 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm4 = [16,17,18,19,20,21,22,23,24,25,26,43,44,45,46,47] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm3, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,17,18,19,20,21,22,23] +; AVX512DQ-BW-FCP-NEXT: vpermi2d %zmm4, %zmm2, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rsi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rdx) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm8, (%rdx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, (%rcx) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, (%r8) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%r9) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm9, (%rdi) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%r9) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, (%rdi) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %wide.vec = load <448 x i8>, ptr %in.vec, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index 9c9dca82f60ca..f626dfe5daf00 100644 --- 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -4093,139 +4093,125 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512: # %bb.0: ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%rax), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vmovdqa (%r9), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888 +; AVX512-NEXT: vmovdqa (%r10), %xmm0 +; AVX512-NEXT: vmovdqa (%rax), %xmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512-NEXT: vmovdqa (%r9), %xmm0 +; AVX512-NEXT: vmovdqa (%r8), %xmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm9 +; AVX512-NEXT: movb $-86, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 -; AVX512-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} -; AVX512-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512-NEXT: kmovw %r11d, %k2 -; AVX512-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} -; AVX512-NEXT: vmovdqa 32(%r10), %ymm15 -; AVX512-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] -; AVX512-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512-NEXT: vpermd %zmm13, %zmm19, 
%zmm31 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512-NEXT: vpermd %zmm6, %zmm20, %zmm14 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX512-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] -; AVX512-NEXT: vmovdqa 32(%r10), %xmm2 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] -; AVX512-NEXT: vmovdqa 32(%rax), %xmm7 -; AVX512-NEXT: vpermd %zmm12, %zmm19, %zmm17 -; AVX512-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512-NEXT: vpermd %zmm4, %zmm18, %zmm17 {%k1} -; AVX512-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512-NEXT: 
vpermd %zmm0, %zmm20, %zmm16 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512-NEXT: vmovdqa 32(%r10), %ymm5 +; AVX512-NEXT: vmovdqa 32(%rax), %ymm10 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] +; AVX512-NEXT: vmovdqa 32(%r9), %ymm13 +; AVX512-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-NEXT: vpermt2d %zmm11, %zmm18, %zmm0 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm11 +; AVX512-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] +; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm5 +; AVX512-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] +; AVX512-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-NEXT: vpermt2d %zmm3, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512-NEXT: vpermd %zmm4, %zmm21, %zmm16 {%k2} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512-NEXT: vpermd %zmm4, %zmm26, %zmm23 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512-NEXT: vpermd %zmm4, %zmm27, %zmm23 {%k1} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vpermd %zmm4, %zmm28, %zmm22 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512-NEXT: vpermd %zmm6, %zmm29, %zmm22 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512-NEXT: vpermd %zmm6, %zmm26, %zmm25 -; AVX512-NEXT: vpermd %zmm2, %zmm27, %zmm25 
{%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpermd %zmm0, %zmm28, %zmm24 -; AVX512-NEXT: vpermd %zmm2, %zmm29, %zmm24 {%k2} -; AVX512-NEXT: vmovdqa (%r10), %ymm0 -; AVX512-NEXT: vmovdqa (%rax), %ymm1 -; AVX512-NEXT: vmovdqa (%r9), %ymm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512-NEXT: vmovdqa (%r10), %ymm8 +; AVX512-NEXT: vmovdqa (%rax), %ymm7 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX512-NEXT: vmovdqa (%r9), %ymm3 ; AVX512-NEXT: vmovdqa (%r8), %ymm4 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512-NEXT: vpermd %zmm6, %zmm19, %zmm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512-NEXT: vpermd %zmm7, %zmm18, %zmm6 {%k1} -; AVX512-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; AVX512-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-NEXT: vpermd %zmm2, %zmm19, %zmm2 -; AVX512-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512-NEXT: vpermd %zmm0, %zmm18, %zmm2 {%k1} -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] -; AVX512-NEXT: vpermd %zmm0, %zmm20, %zmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11] -; AVX512-NEXT: vpermd %zmm13, %zmm21, %zmm0 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] -; AVX512-NEXT: vpermd %zmm4, %zmm20, %zmm4 -; AVX512-NEXT: vpermd %zmm1, %zmm21, %zmm4 {%k2} -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512-NEXT: vpunpcklwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512-NEXT: vpermd %zmm5, %zmm26, %zmm5 -; AVX512-NEXT: vpermd %zmm1, %zmm27, %zmm5 {%k1} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512-NEXT: vpermd %zmm7, %zmm28, %zmm7 -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm7 {%k2} -; AVX512-NEXT: movb $-86, %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm6 +; AVX512-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] +; AVX512-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm10 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; 
AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-NEXT: vpermt2d %zmm6, %zmm18, %zmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512-NEXT: vmovdqa64 %zmm3, (%rax) +; 
AVX512-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; @@ -4234,139 +4220,123 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm25 -; AVX512-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 
(%rcx), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm9 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm27 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm19 +; AVX512-FCP-NEXT: movb $-86, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm25 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm6, %zmm29 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512-FCP-NEXT: kmovw %r11d, %k2 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm29 {%k2} -; AVX512-FCP-NEXT: vmovdqa 32(%r10), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm19 {%k1} +; AVX512-FCP-NEXT: vmovdqa 32(%r10), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = 
ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] ; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm27 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm17, %zmm27 {%k1} -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm18, %zmm30 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm19, %zmm30 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm11 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512-FCP-NEXT: vpermd 
%zmm6, %zmm16, %zmm28 -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm28 {%k1} +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm15 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] +; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), 
%xmm13 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] ; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15] -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15] +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm12 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 ; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm18, %zmm31 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm31 {%k2} -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm21 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm21 {%k1} -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm20 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm26, %zmm23 -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm23 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm22 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm22 {%k2} -; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; 
AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm7 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm3 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm6 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm11 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm11 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm6 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] ; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] -; AVX512-FCP-NEXT: vpermd %zmm13, %zmm18, %zmm13 -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; 
AVX512-FCP-NEXT: vpermd %zmm4, %zmm19, %zmm13 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm1 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm18, %zmm2 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm2 {%k2} -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm26, %zmm3 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm3 {%k1} -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm24, %zmm4 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm4 {%k2} -; AVX512-FCP-NEXT: movb $-86, %al -; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, 
%zmm29 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-FCP-NEXT: vpermt2d %zmm6, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; 
AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm20, %zmm4 ; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, 64(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; @@ -4374,139 +4344,125 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-NEXT: vpunpckhwd 
{{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm27 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm20 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm21 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm22 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm23 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm25 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm0 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm9 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm27 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm17, %zmm9 +; AVX512DQ-NEXT: movb $-86, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8 -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm9 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm10 -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm11 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm28 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512DQ-NEXT: kmovw %r11d, %k2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} -; AVX512DQ-NEXT: vmovdqa 32(%r10), %ymm15 -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm2[0],ymm15[0],ymm2[1],ymm15[1],ymm2[2],ymm15[2],ymm2[3],ymm15[3],ymm2[8],ymm15[8],ymm2[9],ymm15[9],ymm2[10],ymm15[10],ymm2[11],ymm15[11] -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm7 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm12 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm7[0],ymm12[1],ymm7[1],ymm12[2],ymm7[2],ymm12[3],ymm7[3],ymm12[8],ymm7[8],ymm12[9],ymm7[9],ymm12[10],ymm7[10],ymm12[11],ymm7[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512DQ-NEXT: vpermd %zmm13, %zmm19, %zmm31 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm18, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm13 -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm20, %zmm14 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm6 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm4 = 
ymm6[0],ymm13[0],ymm6[1],ymm13[1],ymm6[2],ymm13[2],ymm6[3],ymm13[3],ymm6[8],ymm13[8],ymm6[9],ymm13[9],ymm6[10],ymm13[10],ymm6[11],ymm13[11] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm21, %zmm14 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm15[4],ymm2[5],ymm15[5],ymm2[6],ymm15[6],ymm2[7],ymm15[7],ymm2[12],ymm15[12],ymm2[13],ymm15[13],ymm2[14],ymm15[14],ymm2[15],ymm15[15] -; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm2 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm12 = ymm12[4],ymm7[4],ymm12[5],ymm7[5],ymm12[6],ymm7[6],ymm12[7],ymm7[7],ymm12[12],ymm7[12],ymm12[13],ymm7[13],ymm12[14],ymm7[14],ymm12[15],ymm7[15] -; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm7 -; AVX512DQ-NEXT: vpermd %zmm12, %zmm19, %zmm17 -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm12 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm18, %zmm17 {%k1} -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm15 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm13[4],ymm6[5],ymm13[5],ymm6[6],ymm13[6],ymm6[7],ymm13[7],ymm6[12],ymm13[12],ymm6[13],ymm13[13],ymm6[14],ymm13[14],ymm6[15],ymm13[15] -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm20, %zmm16 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqa 32(%r10), %ymm5 +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm10 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm10[0],ymm5[0],ymm10[1],ymm5[1],ymm10[2],ymm5[2],ymm10[3],ymm5[3],ymm10[8],ymm5[8],ymm10[9],ymm5[9],ymm10[10],ymm5[10],ymm10[11],ymm5[11] +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm13 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm15 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = 
[0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm18, %zmm0 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm3 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm12 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm3[0],ymm12[1],ymm3[1],ymm12[2],ymm3[2],ymm12[3],ymm3[3],ymm12[8],ymm3[8],ymm12[9],ymm3[9],ymm12[10],ymm3[10],ymm12[11],ymm3[11] +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm11 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm10[4],ymm5[4],ymm10[5],ymm5[5],ymm10[6],ymm5[6],ymm10[7],ymm5[7],ymm10[12],ymm5[12],ymm10[13],ymm5[13],ymm10[14],ymm5[14],ymm10[15],ymm5[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm5 +; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm12[4],ymm3[4],ymm12[5],ymm3[5],ymm12[6],ymm3[6],ymm12[7],ymm3[7],ymm12[12],ymm3[12],ymm12[13],ymm3[13],ymm12[14],ymm3[14],ymm12[15],ymm3[15] +; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm10 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm19, %zmm13 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm10[0],xmm6[0],xmm10[1],xmm6[1],xmm10[2],xmm6[2],xmm10[3],xmm6[3] +; 
AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm12 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm14 ; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm21, %zmm16 {%k2} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm15[0],xmm12[0],xmm15[1],xmm12[1],xmm15[2],xmm12[2],xmm15[3],xmm12[3] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm26, %zmm23 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm27, %zmm23 {%k1} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm28, %zmm22 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm4[0],xmm13[0],xmm4[1],xmm13[1],xmm4[2],xmm13[2],xmm4[3],xmm13[3] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm29, %zmm22 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm26, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm27, %zmm25 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm13[4],xmm4[5],xmm13[5],xmm4[6],xmm13[6],xmm4[7],xmm13[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm28, %zmm24 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm29, %zmm24 {%k2} -; AVX512DQ-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm1 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = 
xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm15 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm6[4],xmm10[5],xmm6[5],xmm10[6],xmm6[6],xmm10[7],xmm6[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-NEXT: vmovdqa (%r10), %ymm8 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm7 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm3 ; AVX512DQ-NEXT: vmovdqa (%r8), %ymm4 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm2[0],ymm4[1],ymm2[1],ymm4[2],ymm2[2],ymm4[3],ymm2[3],ymm4[8],ymm2[8],ymm4[9],ymm2[9],ymm4[10],ymm2[10],ymm4[11],ymm2[11] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm19, %zmm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm18, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm4[4],ymm2[4],ymm4[5],ymm2[5],ymm4[6],ymm2[6],ymm4[7],ymm2[7],ymm4[12],ymm2[12],ymm4[13],ymm2[13],ymm4[14],ymm2[14],ymm4[15],ymm2[15] -; 
AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm19, %zmm2 -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm18, %zmm2 {%k1} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm12[0],ymm4[0],ymm12[1],ymm4[1],ymm12[2],ymm4[2],ymm12[3],ymm4[3],ymm12[8],ymm4[8],ymm12[9],ymm4[9],ymm12[10],ymm4[10],ymm12[11],ymm4[11] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm20, %zmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm1[0],ymm7[0],ymm1[1],ymm7[1],ymm1[2],ymm7[2],ymm1[3],ymm7[3],ymm1[8],ymm7[8],ymm1[9],ymm7[9],ymm1[10],ymm7[10],ymm1[11],ymm7[11] -; AVX512DQ-NEXT: vpermd %zmm13, %zmm21, %zmm0 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm7[4],ymm1[5],ymm7[5],ymm1[6],ymm7[6],ymm1[7],ymm7[7],ymm1[12],ymm7[12],ymm1[13],ymm7[13],ymm1[14],ymm7[14],ymm1[15],ymm7[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm12[4],ymm4[4],ymm12[5],ymm4[5],ymm12[6],ymm4[6],ymm12[7],ymm4[7],ymm12[12],ymm4[12],ymm12[13],ymm4[13],ymm12[14],ymm4[14],ymm12[15],ymm4[15] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm20, %zmm4 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm21, %zmm4 {%k2} -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3] -; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm5 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512DQ-NEXT: vpermd %zmm5, %zmm26, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm27, %zmm5 {%k1} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-NEXT: vpermd %zmm7, %zmm28, %zmm7 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm7 {%k2} -; AVX512DQ-NEXT: movb $-86, %al -; 
AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqa64 %zmm30, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm31, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm6 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm12 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm1 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm10 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm10 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-NEXT: vpermt2d %zmm6, %zmm18, %zmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm1 +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm7, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm22, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 64(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm3, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm10, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm9, 64(%rax) ; AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; @@ -4515,139 +4471,123 @@ define 
void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm26 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm25 -; AVX512DQ-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm26 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-FCP-NEXT: vmovdqa 
(%rdi), %xmm9 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm27 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm10, %zmm19 +; AVX512DQ-FCP-NEXT: movb $-86, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm25 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm7 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm8 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm10 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm6, %zmm29 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm24 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 -; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm29 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm19 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm5[0],ymm1[0],ymm5[1],ymm1[1],ymm5[2],ymm1[2],ymm5[3],ymm1[3],ymm5[8],ymm1[8],ymm5[9],ymm1[9],ymm5[10],ymm1[10],ymm5[11],ymm1[11] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm12 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm13 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = 
ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm27 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm17, %zmm27 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm14 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm15[0],ymm1[1],ymm15[1],ymm1[2],ymm15[2],ymm1[3],ymm15[3],ymm1[8],ymm15[8],ymm1[9],ymm15[9],ymm1[10],ymm15[10],ymm1[11],ymm15[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm18, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm0[0],ymm14[0],ymm0[1],ymm14[1],ymm0[2],ymm14[2],ymm0[3],ymm14[3],ymm0[8],ymm14[8],ymm0[9],ymm14[9],ymm0[10],ymm14[10],ymm0[11],ymm14[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm19, %zmm30 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm11 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm16, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm28 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm14 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = 
ymm14[0],ymm12[0],ymm14[1],ymm12[1],ymm14[2],ymm12[2],ymm14[3],ymm12[3],ymm14[8],ymm12[8],ymm14[9],ymm12[9],ymm14[10],ymm12[10],ymm14[11],ymm12[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm10, %zmm17, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, %zmm10 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm1[4],ymm5[5],ymm1[5],ymm5[6],ymm1[6],ymm5[7],ymm1[7],ymm5[12],ymm1[12],ymm5[13],ymm1[13],ymm5[14],ymm1[14],ymm5[15],ymm1[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm14[4],ymm12[4],ymm14[5],ymm12[5],ymm14[6],ymm12[6],ymm14[7],ymm12[7],ymm14[12],ymm12[12],ymm14[13],ymm12[13],ymm14[14],ymm12[14],ymm14[15],ymm12[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm6 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm11 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm13 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm12 = 
ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] ; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm14[4],ymm0[5],ymm14[5],ymm0[6],ymm14[6],ymm0[7],ymm14[7],ymm0[12],ymm14[12],ymm0[13],ymm14[13],ymm0[14],ymm14[14],ymm0[15],ymm14[15] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm15[4],ymm1[5],ymm15[5],ymm1[6],ymm15[6],ymm1[7],ymm15[7],ymm1[12],ymm15[12],ymm1[13],ymm15[13],ymm1[14],ymm15[14],ymm1[15],ymm15[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm18, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm12 {%k1} +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm3 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm11 ; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm18, %zmm31 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm12 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm21 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm11[0],xmm3[1],xmm11[1],xmm3[2],xmm11[2],xmm3[3],xmm11[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm21 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm1 = 
[0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm1, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm20 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm26, %zmm23 -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm26, %zmm23 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm24, %zmm22 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm22 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm11[0],xmm3[0],xmm11[1],xmm3[1],xmm11[2],xmm3[2],xmm11[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = 
xmm11[4],xmm3[4],xmm11[5],xmm3[5],xmm11[6],xmm3[6],xmm11[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm5 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm20, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm5 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11] ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm3[0],ymm6[1],ymm3[1],ymm6[2],ymm3[2],ymm6[3],ymm3[3],ymm6[8],ymm3[8],ymm6[9],ymm3[9],ymm6[10],ymm3[10],ymm6[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm16, %zmm11 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm17, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm11 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm11[0],ymm13[1],ymm11[1],ymm13[2],ymm11[2],ymm13[3],ymm11[3],ymm13[8],ymm11[8],ymm13[9],ymm11[9],ymm13[10],ymm11[10],ymm13[11],ymm11[11] ; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm12 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm13 = ymm12[0],ymm2[0],ymm12[1],ymm2[1],ymm12[2],ymm2[2],ymm12[3],ymm2[3],ymm12[8],ymm2[8],ymm12[9],ymm2[9],ymm12[10],ymm2[10],ymm12[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm13, %zmm18, %zmm13 -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm15 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = 
ymm14[0],ymm15[0],ymm14[1],ymm15[1],ymm14[2],ymm15[2],ymm14[3],ymm15[3],ymm14[8],ymm15[8],ymm14[9],ymm15[9],ymm14[10],ymm15[10],ymm14[11],ymm15[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm19, %zmm13 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm6[4],ymm3[4],ymm6[5],ymm3[5],ymm6[6],ymm3[6],ymm6[7],ymm3[7],ymm6[12],ymm3[12],ymm6[13],ymm3[13],ymm6[14],ymm3[14],ymm6[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm16, %zmm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm17, %zmm1 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm14[4],ymm15[4],ymm14[5],ymm15[5],ymm14[6],ymm15[6],ymm14[7],ymm15[7],ymm14[12],ymm15[12],ymm14[13],ymm15[13],ymm14[14],ymm15[14],ymm14[15],ymm15[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm12[4],ymm2[4],ymm12[5],ymm2[5],ymm12[6],ymm2[6],ymm12[7],ymm2[7],ymm12[12],ymm2[12],ymm12[13],ymm2[13],ymm12[14],ymm2[14],ymm12[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm18, %zmm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm19, %zmm2 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm3 = xmm5[0],mem[0],xmm5[1],mem[1],xmm5[2],mem[2],xmm5[3],mem[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm26, %zmm3 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm26, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] -; AVX512DQ-FCP-NEXT: vpermd 
%zmm4, %zmm24, %zmm4 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm24, %zmm4 {%k2} -; AVX512DQ-FCP-NEXT: movb $-86, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm29 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm30 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm31 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm18, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm7[4],ymm8[4],ymm7[5],ymm8[5],ymm7[6],ymm8[6],ymm7[7],ymm8[7],ymm7[12],ymm8[12],ymm7[13],ymm8[13],ymm7[14],ymm8[14],ymm7[15],ymm8[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm6, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm13[4],ymm11[4],ymm13[5],ymm11[5],ymm13[6],ymm11[6],ymm13[7],ymm11[7],ymm13[12],ymm11[12],ymm13[13],ymm11[13],ymm13[14],ymm11[14],ymm13[15],ymm11[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm18, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 
%xmm23, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm20, %zmm4 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 128(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 320(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, 64(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 320(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 256(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, 64(%rax) ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; @@ -7777,1095 +7717,959 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; ; AVX512-LABEL: store_i16_stride8_vf64: ; AVX512: # %bb.0: -; AVX512-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX512-NEXT: subq $392, %rsp # imm = 0x188 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: vmovdqa (%r10), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; 
AVX512-NEXT: vmovdqa 64(%r10), %xmm4 -; AVX512-NEXT: vmovdqa (%rax), %xmm1 -; AVX512-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512-NEXT: vmovdqa (%r9), %xmm0 -; AVX512-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-NEXT: vmovdqa (%r8), %xmm7 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512-NEXT: kmovw %r11d, %k2 -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: movw $8738, %r11w # imm = 0x2222 +; AVX512-NEXT: vmovdqa (%r10), %xmm1 +; AVX512-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512-NEXT: vmovdqa (%r9), %xmm2 +; AVX512-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512-NEXT: vmovdqa (%rdi), %xmm4 +; 
AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512-NEXT: movb $-86, %r11b ; AVX512-NEXT: kmovw %r11d, %k1 -; AVX512-NEXT: vmovdqa 96(%r10), %ymm2 -; AVX512-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512-NEXT: vmovdqa 96(%r9), %ymm8 -; AVX512-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 96(%rcx), %ymm10 -; AVX512-NEXT: vmovdqa 96(%rdx), %ymm11 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 96(%r10), %ymm0 +; AVX512-NEXT: vmovdqa 96(%rax), %ymm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] 
+; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 +; AVX512-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] ; AVX512-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512-NEXT: vmovdqa 96(%rdi), %ymm13 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512-NEXT: vpermd %zmm1, %zmm16, %zmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512-NEXT: vpermd %zmm5, %zmm19, %zmm0 -; AVX512-NEXT: vpermd %zmm2, %zmm18, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = 
ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512-NEXT: vpermd %zmm5, %zmm16, %zmm31 -; AVX512-NEXT: vpermd %zmm2, %zmm17, %zmm31 {%k1} -; AVX512-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512-NEXT: vmovdqa 96(%rax), %xmm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512-NEXT: vmovdqa 96(%r9), %xmm10 -; AVX512-NEXT: vmovdqa 96(%r8), %xmm11 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512-NEXT: vpermd %zmm12, %zmm30, %zmm0 -; AVX512-NEXT: vpermd %zmm9, %zmm29, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512-NEXT: vpermd %zmm8, %zmm30, %zmm0 -; AVX512-NEXT: vpermd %zmm2, %zmm29, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%r10), %ymm2 -; AVX512-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX512-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX512-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512-NEXT: vpermd %zmm12, %zmm19, %zmm0 -; AVX512-NEXT: vpermd %zmm8, %zmm18, %zmm0 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX512-NEXT: vpunpcklwd 
{{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512-NEXT: vpermt2d %zmm10, %zmm19, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512-NEXT: vmovdqa 96(%rax), %xmm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX512-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX512-NEXT: vpunpcklwd {{.*#+}} 
xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 64(%r10), %ymm6 +; AVX512-NEXT: vmovdqa 64(%rax), %ymm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 +; AVX512-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] ; AVX512-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX512-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512-NEXT: vpermd %zmm8, 
%zmm16, %zmm26 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] -; AVX512-NEXT: vpermd %zmm8, %zmm17, %zmm26 {%k1} -; AVX512-NEXT: vmovdqa 64(%r9), %xmm8 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] -; AVX512-NEXT: vmovdqa 64(%r8), %xmm9 -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512-NEXT: vpermd %zmm10, %zmm19, %zmm5 -; AVX512-NEXT: vpermd %zmm2, %zmm18, %zmm5 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-NEXT: vpermt2d %zmm14, %zmm19, %zmm7 +; AVX512-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; 
AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512-NEXT: vpermd %zmm0, %zmm16, %zmm24 -; AVX512-NEXT: vpermd %zmm2, %zmm17, %zmm24 {%k1} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512-NEXT: vpermd %zmm1, %zmm30, %zmm1 -; AVX512-NEXT: vpermd %zmm0, %zmm29, %zmm1 {%k2} -; AVX512-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX512-NEXT: vpermd %zmm9, %zmm19, %zmm28 -; AVX512-NEXT: vpermd %zmm3, %zmm18, %zmm28 {%k2} -; AVX512-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] -; AVX512-NEXT: vpermd %zmm3, %zmm16, %zmm23 -; AVX512-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = 
ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11] -; AVX512-NEXT: vpermd %zmm6, %zmm17, %zmm23 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX512-NEXT: vpermd %zmm1, %zmm19, %zmm25 -; AVX512-NEXT: vpermd %zmm0, %zmm18, %zmm25 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15] -; AVX512-NEXT: vpermd %zmm0, %zmm16, %zmm21 -; AVX512-NEXT: vpermd %zmm3, %zmm17, %zmm21 {%k1} -; AVX512-NEXT: vmovdqa (%r10), %ymm3 -; AVX512-NEXT: vmovdqa (%r9), %ymm4 -; AVX512-NEXT: vmovdqa (%r8), %ymm6 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512-NEXT: vpermd %zmm8, %zmm19, %zmm27 -; AVX512-NEXT: vmovdqa (%rax), %ymm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11] -; AVX512-NEXT: vpermd %zmm9, %zmm18, %zmm27 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = 
ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] -; AVX512-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512-NEXT: vpermd %zmm4, %zmm19, %zmm20 -; AVX512-NEXT: vmovdqa (%rsi), %ymm4 -; AVX512-NEXT: vpermd %zmm3, %zmm18, %zmm20 {%k2} -; AVX512-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512-NEXT: vpermd %zmm8, %zmm16, %zmm18 -; AVX512-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512-NEXT: vpermd %zmm9, %zmm17, %zmm18 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX512-NEXT: vpermd %zmm3, %zmm16, %zmm16 +; AVX512-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vmovdqa 64(%r10), %xmm7 +; AVX512-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX512-NEXT: vmovdqa 64(%r9), %xmm8 +; AVX512-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512-NEXT: vpermt2d %zmm4, %zmm16, %zmm5 +; AVX512-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512-NEXT: 
vmovdqa 64(%rdi), %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm9 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm15 +; AVX512-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX512-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX512-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512-NEXT: vpermt2d %zmm2, %zmm18, %zmm6 +; AVX512-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm14 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-NEXT: vpermt2d %zmm9, %zmm19, %zmm14 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm5 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512-NEXT: vpermt2d %zmm2, %zmm19, %zmm13 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512-NEXT: vmovdqa 32(%r10), %xmm10 ; AVX512-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512-NEXT: vpermd %zmm6, %zmm17, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512-NEXT: vpermd %zmm10, %zmm30, %zmm19 -; AVX512-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512-NEXT: vpermd %zmm2, %zmm29, %zmm19 {%k2} -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512-NEXT: vpermd %zmm2, %zmm30, %zmm10 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512-NEXT: vpermd %zmm9, %zmm29, %zmm10 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512-NEXT: vmovdqa (%rdx), %xmm13 -; 
AVX512-NEXT: vpermd %zmm4, %zmm30, %zmm17 -; AVX512-NEXT: vpermd %zmm3, %zmm29, %zmm17 {%k2} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX512-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX512-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512-NEXT: vpermd %zmm6, %zmm30, %zmm8 -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} -; AVX512-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512-NEXT: vpermd %zmm6, %zmm29, %zmm6 -; AVX512-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX512-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-NEXT: vpermd %zmm14, %zmm30, %zmm6 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm5 -; AVX512-NEXT: vpermd %zmm0, %zmm30, %zmm5 {%k1} -; AVX512-NEXT: vmovdqa 
64(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpermd %zmm3, %zmm29, %zmm4 -; AVX512-NEXT: vmovdqa 64(%rcx), %xmm14 -; AVX512-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512-NEXT: vpermd %zmm15, %zmm30, %zmm4 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm3 -; AVX512-NEXT: vpermd %zmm0, %zmm30, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512-NEXT: vpermd %zmm14, %zmm29, %zmm14 -; AVX512-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512-NEXT: vpermd %zmm11, %zmm30, %zmm14 {%k1} -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512-NEXT: vpermd %zmm1, %zmm29, %zmm1 -; AVX512-NEXT: vpermd %zmm0, %zmm30, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512-NEXT: vpermd %zmm2, %zmm29, %zmm2 -; AVX512-NEXT: vpermd %zmm0, %zmm30, %zmm2 {%k1} -; AVX512-NEXT: movb $-86, %al -; AVX512-NEXT: kmovw %eax, %k1 -; AVX512-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} -; AVX512-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm25, %zmm21 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} -; AVX512-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm6 +; AVX512-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; 
AVX512-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm12 +; AVX512-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512-NEXT: vpermt2d %zmm1, %zmm17, %zmm4 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-NEXT: vmovdqa (%r10), %ymm10 +; AVX512-NEXT: vmovdqa (%rax), %ymm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11] +; AVX512-NEXT: vmovdqa (%r9), %ymm5 +; AVX512-NEXT: vmovdqa (%r8), %ymm6 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm7 +; AVX512-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512-NEXT: vpermt2d %zmm11, %zmm19, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = 
ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512-NEXT: vpermt2d %zmm3, %zmm18, %zmm5 +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512-NEXT: vpermt2d %zmm3, %zmm19, %zmm1 +; AVX512-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512-NEXT: vmovdqa64 %xmm27, %xmm6 +; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512-NEXT: vmovdqa64 %zmm18, 128(%rax) -; AVX512-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512-NEXT: vmovdqa64 %zmm21, 
448(%rax) -; AVX512-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512-NEXT: vmovdqa64 %zmm24, 704(%rax) -; AVX512-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX512-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512-NEXT: addq $392, %rsp # imm = 0x188 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq ; ; AVX512-FCP-LABEL: store_i16_stride8_vf64: ; AVX512-FCP: # %bb.0: -; AVX512-FCP-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512-FCP-NEXT: subq $328, %rsp # imm = 0x148 ; 
AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa (%rax), %xmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm1 -; AVX512-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512-FCP-NEXT: kmovw %r11d, %k2 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} 
xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm17 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 +; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-FCP-NEXT: vmovdqa64 %xmm4, %xmm27 +; AVX512-FCP-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0] +; AVX512-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm20 +; AVX512-FCP-NEXT: movb $-86, %r11b ; AVX512-FCP-NEXT: kmovw %r11d, %k1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqa 96(%r10), %ymm5 -; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm13 -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm15 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = 
ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm23, %zmm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm22, %zmm0 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512-FCP-NEXT: vpermd %zmm6, %zmm20, %zmm10 -; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX512-FCP-NEXT: vpermd %zmm9, %zmm21, %zmm10 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm13 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%r10), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm1 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm22, %zmm5 {%k2} -; AVX512-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] -; AVX512-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm16 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm8 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] -; AVX512-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm16 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm19 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm19 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm18 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512-FCP-NEXT: vmovdqa 96(%r10), %ymm0 +; AVX512-FCP-NEXT: vmovdqa 96(%rax), %ymm1 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm4 +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %ymm4 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm25 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm22, %zmm25 {%k2} -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm24 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512-FCP-NEXT: 
vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm24 {%k1} +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm27 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm27 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm26 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm26 {%k1} -; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm23, %zmm28 -; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm28 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm23 -; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm23 {%k2} -; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm20, %zmm22 -; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm21, %zmm22 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd 
{{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm20, %zmm20 -; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm21, %zmm20 {%k1} -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm21 -; AVX512-FCP-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rax), %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm14, %zmm21 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm29 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm29 {%k2} -; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm30 -; AVX512-FCP-NEXT: vmovdqa 64(%r10), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm14, %zmm30 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 
= xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm31 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm31 {%k2} -; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm6 -; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm14, %zmm6 {%k2} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm1 -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm14, %zmm1 {%k2} -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm14, %zmm9 -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm9 {%k2} -; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm8 -; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm4 -; 
AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm8 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm2 -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512-FCP-NEXT: vmovdqa 96(%rax), %xmm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512-FCP-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 96(%r8), %xmm4 ; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512-FCP-NEXT: vpermd %zmm5, %zmm7, %zmm5 -; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm12 -; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm14 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm5 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = 
xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512-FCP-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; AVX512-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm12 -; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512-FCP-NEXT: vpermd %zmm15, %zmm7, %zmm12 {%k1} -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm4 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] -; AVX512-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm11 -; AVX512-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm11 {%k1} -; AVX512-FCP-NEXT: movb $-86, %al -; AVX512-FCP-NEXT: kmovw %eax, %k1 -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 {%k1} -; AVX512-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512-FCP-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512-FCP-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX512-FCP-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2 +; AVX512-FCP-NEXT: vmovdqa64 
%zmm1, %zmm2 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%r10), %ymm6 +; AVX512-FCP-NEXT: vmovdqa 64(%rax), %ymm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512-FCP-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm12 +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 +; AVX512-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512-FCP-NEXT: vpermt2d %zmm5, %zmm17, %zmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] +; 
AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512-FCP-NEXT: vmovdqa 64(%r10), %xmm8 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX512-FCP-NEXT: vmovdqa 64(%r9), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm16, %zmm5 +; AVX512-FCP-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512-FCP-NEXT: vmovdqa 64(%rsi), %xmm13 +; AVX512-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm9 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm15 +; AVX512-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; 
AVX512-FCP-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm14 +; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX512-FCP-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] 
+; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm19, %zmm13 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512-FCP-NEXT: vmovdqa 32(%r10), %xmm10 +; AVX512-FCP-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512-FCP-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512-FCP-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm6 +; AVX512-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm12 ; AVX512-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm4 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512-FCP-NEXT: vmovdqa (%r10), %ymm10 +; AVX512-FCP-NEXT: vmovdqa (%rax), %ymm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11] +; AVX512-FCP-NEXT: vmovdqa (%r9), %ymm5 +; AVX512-FCP-NEXT: vmovdqa (%r8), %ymm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm7 +; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm0 +; AVX512-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm17, %zmm5 +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm1 +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512-FCP-NEXT: vmovdqa64 
%xmm21, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512-FCP-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512-FCP-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 +; AVX512-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm5, 512(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm8, 768(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm10, 896(%rax) -; AVX512-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512-FCP-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm14, 
384(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512-FCP-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512-FCP-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512-FCP-NEXT: vzeroupper ; AVX512-FCP-NEXT: retq ; ; AVX512DQ-LABEL: store_i16_stride8_vf64: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: subq $504, %rsp # imm = 0x1F8 +; AVX512DQ-NEXT: subq $392, %rsp # imm = 0x188 ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-NEXT: vmovdqa (%r10), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r10), %xmm4 -; AVX512DQ-NEXT: vmovdqa (%rax), %xmm1 -; AVX512DQ-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa (%r9), %xmm0 -; AVX512DQ-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-NEXT: vmovdqa (%r8), %xmm7 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512DQ-NEXT: 
vpmovsxbd {{.*#+}} zmm30 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512DQ-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512DQ-NEXT: kmovw %r11d, %k2 -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: movw $8738, %r11w # imm = 0x2222 +; AVX512DQ-NEXT: vmovdqa (%r10), %xmm1 +; AVX512DQ-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm20 +; AVX512DQ-NEXT: vmovdqa64 %xmm1, %xmm21 +; AVX512DQ-NEXT: vmovdqa (%r9), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm22 +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm23 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm24 +; AVX512DQ-NEXT: vmovdqa64 %xmm2, %xmm25 +; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vmovdqa64 %xmm4, %xmm26 +; AVX512DQ-NEXT: vmovdqa64 %xmm3, %xmm27 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512DQ-NEXT: movb $-86, %r11b ; AVX512DQ-NEXT: kmovw %r11d, %k1 -; AVX512DQ-NEXT: vmovdqa 96(%r10), %ymm2 -; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm5 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} 
ymm1 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm8 -; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512DQ-NEXT: vpermd %zmm10, %zmm19, %zmm0 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm18, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm10 -; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm11 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 96(%r10), %ymm0 +; AVX512DQ-NEXT: vmovdqa 96(%rax), %ymm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; AVX512DQ-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqa 96(%r8), %ymm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 +; AVX512DQ-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512DQ-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] ; AVX512DQ-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512DQ-NEXT: 
vmovdqa 96(%rdi), %ymm13 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm16 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm16, %zmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm17 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-NEXT: vpermd %zmm14, %zmm17, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512DQ-NEXT: vpermd %zmm5, %zmm19, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm18, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] -; AVX512DQ-NEXT: vpermd %zmm5, %zmm16, %zmm31 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm17, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512DQ-NEXT: vmovdqa 96(%rax), %xmm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm2[0],xmm8[1],xmm2[1],xmm8[2],xmm2[2],xmm8[3],xmm2[3] -; AVX512DQ-NEXT: vmovdqa 
96(%r9), %xmm10 -; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm11 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; AVX512DQ-NEXT: vpermd %zmm12, %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm9, %zmm29, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm8[4],xmm2[4],xmm8[5],xmm2[5],xmm8[6],xmm2[6],xmm8[7],xmm2[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm30, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm29, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%r10), %ymm2 -; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm9 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm2[0],ymm9[1],ymm2[1],ymm9[2],ymm2[2],ymm9[3],ymm2[3],ymm9[8],ymm2[8],ymm9[9],ymm2[9],ymm9[10],ymm2[10],ymm9[11],ymm2[11] -; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm10 -; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm11 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11] -; AVX512DQ-NEXT: vpermd %zmm12, %zmm19, %zmm0 -; AVX512DQ-NEXT: vpermd %zmm8, %zmm18, %zmm0 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm12 -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm15 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm19, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512DQ-NEXT: vmovdqa 96(%rax), %xmm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512DQ-NEXT: vmovdqa 96(%r8), %xmm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm14 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] +; AVX512DQ-NEXT: vpermt2d %zmm10, %zmm17, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm17, %zmm2 +; AVX512DQ-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%r10), %ymm6 +; AVX512DQ-NEXT: vmovdqa 64(%rax), %ymm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512DQ-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm10 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm10 +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] ; AVX512DQ-NEXT: vmovdqa 64(%rsi), %ymm1 ; AVX512DQ-NEXT: vmovdqa 64(%rdi), %ymm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm26 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm15[0],ymm12[0],ymm15[1],ymm12[1],ymm15[2],ymm12[2],ymm15[3],ymm12[3],ymm15[8],ymm12[8],ymm15[9],ymm12[9],ymm15[10],ymm12[10],ymm15[11],ymm12[11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm17, 
%zmm26 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm8 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm9[4],ymm2[4],ymm9[5],ymm2[5],ymm9[6],ymm2[6],ymm9[7],ymm2[7],ymm9[12],ymm2[12],ymm9[13],ymm2[13],ymm9[14],ymm2[14],ymm9[15],ymm2[15] -; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm9 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm10 = ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15] -; AVX512DQ-NEXT: vpermd %zmm10, %zmm19, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm18, %zmm5 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm15[4],ymm12[4],ymm15[5],ymm12[5],ymm15[6],ymm12[6],ymm15[7],ymm12[7],ymm15[12],ymm12[12],ymm15[13],ymm12[13],ymm15[14],ymm12[14],ymm15[15],ymm12[15] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-NEXT: vpermt2d %zmm14, %zmm19, %zmm7 +; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm7 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-NEXT: vpermt2d %zmm5, %zmm18, %zmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] ; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm16, 
%zmm24 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm17, %zmm24 {%k1} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm30, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm29, %zmm1 {%k2} -; AVX512DQ-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm10 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX512DQ-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm4 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm4[0],ymm8[1],ymm4[1],ymm8[2],ymm4[2],ymm8[3],ymm4[3],ymm8[8],ymm4[8],ymm8[9],ymm4[9],ymm8[10],ymm4[10],ymm8[11],ymm4[11] -; AVX512DQ-NEXT: vpermd %zmm9, %zmm19, %zmm28 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm18, %zmm28 {%k2} -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm9 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm11[0],ymm9[0],ymm11[1],ymm9[1],ymm11[2],ymm9[2],ymm11[3],ymm9[3],ymm11[8],ymm9[8],ymm11[9],ymm9[9],ymm11[10],ymm9[10],ymm11[11],ymm9[11] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm16, %zmm23 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm12 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm3 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm3[0],ymm12[0],ymm3[1],ymm12[1],ymm3[2],ymm12[2],ymm3[3],ymm12[3],ymm3[8],ymm12[8],ymm3[9],ymm12[9],ymm3[10],ymm12[10],ymm3[11],ymm12[11] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm17, %zmm23 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = 
ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm8[4],ymm4[4],ymm8[5],ymm4[5],ymm8[6],ymm4[6],ymm8[7],ymm4[7],ymm8[12],ymm4[12],ymm8[13],ymm4[13],ymm8[14],ymm4[14],ymm8[15],ymm4[15] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm19, %zmm25 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm18, %zmm25 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm12[4],ymm3[5],ymm12[5],ymm3[6],ymm12[6],ymm3[7],ymm12[7],ymm3[12],ymm12[12],ymm3[13],ymm12[13],ymm3[14],ymm12[14],ymm3[15],ymm12[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm11[4],ymm9[4],ymm11[5],ymm9[5],ymm11[6],ymm9[6],ymm11[7],ymm9[7],ymm11[12],ymm9[12],ymm11[13],ymm9[13],ymm11[14],ymm9[14],ymm11[15],ymm9[15] -; AVX512DQ-NEXT: vpermd %zmm0, %zmm16, %zmm21 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm17, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqa (%r10), %ymm3 -; AVX512DQ-NEXT: vmovdqa (%r9), %ymm4 -; AVX512DQ-NEXT: vmovdqa (%r8), %ymm6 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm6[0],ymm4[0],ymm6[1],ymm4[1],ymm6[2],ymm4[2],ymm6[3],ymm4[3],ymm6[8],ymm4[8],ymm6[9],ymm4[9],ymm6[10],ymm4[10],ymm6[11],ymm4[11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm19, %zmm27 -; AVX512DQ-NEXT: vmovdqa (%rax), %ymm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm3[0],ymm8[1],ymm3[1],ymm8[2],ymm3[2],ymm8[3],ymm3[3],ymm8[8],ymm3[8],ymm8[9],ymm3[9],ymm8[10],ymm3[10],ymm8[11],ymm3[11] -; AVX512DQ-NEXT: vpermd %zmm9, %zmm18, %zmm27 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm8[4],ymm3[4],ymm8[5],ymm3[5],ymm8[6],ymm3[6],ymm8[7],ymm3[7],ymm8[12],ymm3[12],ymm8[13],ymm3[13],ymm8[14],ymm3[14],ymm8[15],ymm3[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm6[4],ymm4[4],ymm6[5],ymm4[5],ymm6[6],ymm4[6],ymm6[7],ymm4[7],ymm6[12],ymm4[12],ymm6[13],ymm4[13],ymm6[14],ymm4[14],ymm6[15],ymm4[15] -; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm6 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm19, %zmm20 -; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4 -; 
AVX512DQ-NEXT: vpermd %zmm3, %zmm18, %zmm20 {%k2} -; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[8],ymm4[8],ymm3[9],ymm4[9],ymm3[10],ymm4[10],ymm3[11],ymm4[11] -; AVX512DQ-NEXT: vpermd %zmm8, %zmm16, %zmm18 -; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm8 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11] -; AVX512DQ-NEXT: vpermd %zmm9, %zmm17, %zmm18 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm6 = ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[12],ymm4[12],ymm3[13],ymm4[13],ymm3[14],ymm4[14],ymm3[15],ymm4[15] -; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm4 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm16, %zmm16 +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vmovdqa 64(%r10), %xmm7 +; AVX512DQ-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX512DQ-NEXT: vmovdqa 64(%r9), %xmm8 +; AVX512DQ-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3] +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm16, %zmm5 +; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm14 +; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm9 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm7[4],xmm2[5],xmm7[5],xmm2[6],xmm7[6],xmm2[7],xmm7[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm15 +; AVX512DQ-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512DQ-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX512DQ-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX512DQ-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm18, %zmm6 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-NEXT: vpermt2d %zmm9, %zmm19, %zmm14 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, 
%zmm14 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX512DQ-NEXT: vpermt2d %zmm4, %zmm18, %zmm5 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm19, %zmm13 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512DQ-NEXT: vmovdqa 32(%r10), %xmm10 ; AVX512DQ-NEXT: vmovdqa 32(%rax), %xmm3 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm17, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm6 -; AVX512DQ-NEXT: vpermd %zmm10, %zmm30, %zmm19 -; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm8 -; AVX512DQ-NEXT: vpermd %zmm2, %zmm29, %zmm19 {%k2} -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm30, %zmm10 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512DQ-NEXT: vpermd %zmm9, %zmm29, %zmm10 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] -; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm0 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512DQ-NEXT: vpermd %zmm4, %zmm30, %zmm17 -; AVX512DQ-NEXT: vpermd %zmm3, %zmm29, %zmm17 {%k2} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = 
xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] -; AVX512DQ-NEXT: vmovdqa64 %xmm0, %xmm22 -; AVX512DQ-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm4 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm4 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm9 -; AVX512DQ-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm7, %xmm6 # 16-byte Folded Reload -; AVX512DQ-NEXT: # xmm6 = xmm7[0],mem[0],xmm7[1],mem[1],xmm7[2],mem[2],xmm7[3],mem[3] -; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-NEXT: vpermd %zmm6, %zmm30, %zmm8 -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512DQ-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm29 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm30 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqa 96(%rsi), %xmm3 -; AVX512DQ-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX512DQ-NEXT: vpermd %zmm6, %zmm29, %zmm6 -; AVX512DQ-NEXT: vmovdqa 96(%rcx), %xmm1 -; AVX512DQ-NEXT: vmovdqa 96(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-NEXT: vpermd %zmm14, %zmm30, %zmm6 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm5 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm30, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqa 64(%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX512DQ-NEXT: 
vpunpcklwd {{.*#+}} xmm3 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpermd %zmm3, %zmm29, %zmm4 -; AVX512DQ-NEXT: vmovdqa 64(%rcx), %xmm14 -; AVX512DQ-NEXT: vmovdqa 64(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512DQ-NEXT: vpermd %zmm15, %zmm30, %zmm4 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm3 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm30, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm1 -; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm2 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; AVX512DQ-NEXT: vpermd %zmm14, %zmm29, %zmm14 -; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; AVX512DQ-NEXT: vpermd %zmm11, %zmm30, %zmm14 {%k1} -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-NEXT: vpermd %zmm1, %zmm29, %zmm1 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm30, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm0 -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] -; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3] -; AVX512DQ-NEXT: vpermd %zmm2, %zmm29, %zmm2 -; AVX512DQ-NEXT: vpermd %zmm0, %zmm30, %zmm2 {%k1} -; AVX512DQ-NEXT: movb $-86, %al -; AVX512DQ-NEXT: kmovw %eax, %k1 -; AVX512DQ-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm0, %zmm7 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm0 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm31 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm6 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm5 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm26 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm24 {%k1} -; AVX512DQ-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload -; AVX512DQ-NEXT: vmovdqa64 %zmm9, %zmm4 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm19, %zmm3 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm28, %zmm23 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm25, %zmm21 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm10, %zmm14 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm17, %zmm1 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm27, %zmm18 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm20, %zmm16 {%k1} -; AVX512DQ-NEXT: vmovdqa64 %zmm8, %zmm2 {%k1} +; AVX512DQ-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512DQ-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm6 +; AVX512DQ-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512DQ-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512DQ-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm12 +; AVX512DQ-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512DQ-NEXT: vpermt2d %zmm1, %zmm17, %zmm4 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-NEXT: vmovdqa (%r10), %ymm10 +; AVX512DQ-NEXT: vmovdqa (%rax), %ymm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11] +; AVX512DQ-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-NEXT: vpermt2d %zmm0, %zmm18, %zmm7 +; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-NEXT: vpermt2d %zmm11, %zmm19, %zmm0 +; 
AVX512DQ-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm18, %zmm5 +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512DQ-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512DQ-NEXT: vpermt2d %zmm3, %zmm19, %zmm1 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512DQ-NEXT: vmovdqa64 %xmm20, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm21, %xmm3 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm23, %xmm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512DQ-NEXT: vmovdqa64 %xmm24, %xmm2 +; AVX512DQ-NEXT: vmovdqa64 %xmm25, %xmm5 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512DQ-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-NEXT: vmovdqa64 %xmm27, %xmm6 +; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512DQ-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512DQ-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm16, 192(%rax) -; AVX512DQ-NEXT: vmovdqa64 
%zmm18, 128(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm1, 320(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm14, 256(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm21, 448(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm23, 384(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm4, 512(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm24, 704(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm26, 640(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm5, 832(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm6, 768(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm31, 960(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm0, 896(%rax) -; AVX512DQ-NEXT: vmovdqa64 %zmm7, 64(%rax) -; AVX512DQ-NEXT: addq $504, %rsp # imm = 0x1F8 +; AVX512DQ-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm12, 256(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512DQ-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-NEXT: vmovaps %zmm0, 64(%rax) +; AVX512DQ-NEXT: addq $392, %rsp # imm = 0x188 ; 
AVX512DQ-NEXT: vzeroupper ; AVX512DQ-NEXT: retq ; ; AVX512DQ-FCP-LABEL: store_i16_stride8_vf64: ; AVX512DQ-FCP: # %bb.0: -; AVX512DQ-FCP-NEXT: subq $264, %rsp # imm = 0x108 +; AVX512DQ-FCP-NEXT: subq $328, %rsp # imm = 0x148 ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %xmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm21 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm1, %xmm22 ; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm14 = [0,0,0,0,0,0,1,1,2,2,2,2,0,0,3,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: movw $-30584, %r11w # imm = 0x8888 -; AVX512DQ-FCP-NEXT: kmovw %r11d, %k2 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = 
xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1 -; AVX512DQ-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,0,1,1,1,1,0,0,2,0,3,0,3,0,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm2, %zmm17 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,1,1,1,1,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-FCP-NEXT: movw $8738, %r11w # imm = 0x2222 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm23 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm24 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm16 = [0,0,0,16,0,0,1,17,2,2,2,18,0,0,3,19] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm25 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm2, %xmm26 +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm4, %xmm27 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm3, %xmm28 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [0,16,1,0,1,17,0,0,2,18,3,0,3,19,0,0] +; AVX512DQ-FCP-NEXT: vpermi2d %zmm0, %zmm2, %zmm20 +; AVX512DQ-FCP-NEXT: movb $-86, %r11b ; AVX512DQ-FCP-NEXT: kmovw %r11d, %k1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %ymm5 -; 
AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm13 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm15 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm15[0],ymm13[0],ymm15[1],ymm13[1],ymm15[2],ymm13[2],ymm15[3],ymm13[3],ymm15[8],ymm13[8],ymm15[9],ymm13[9],ymm15[10],ymm13[10],ymm15[11],ymm13[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm23 = [0,1,4,5,4,5,5,7,2,1,6,5,6,5,7,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm23, %zmm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm22 = [0,0,4,4,4,4,6,5,0,2,4,6,4,6,6,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm22, %zmm0 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm20 = [4,5,1,3,5,7,5,7,6,5,3,3,7,7,7,7] -; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm20, %zmm10 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm0 -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm21 = [4,4,2,1,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm0[0],ymm3[0],ymm0[1],ymm3[1],ymm0[2],ymm3[2],ymm0[3],ymm3[3],ymm0[8],ymm3[8],ymm0[9],ymm3[9],ymm0[10],ymm3[10],ymm0[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm9, %zmm21, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = 
ymm15[4],ymm13[4],ymm15[5],ymm13[5],ymm15[6],ymm13[6],ymm15[7],ymm13[7],ymm15[12],ymm13[12],ymm15[13],ymm13[13],ymm15[14],ymm13[14],ymm15[15],ymm13[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm3[4],ymm0[5],ymm3[5],ymm0[6],ymm3[6],ymm0[7],ymm3[7],ymm0[12],ymm3[12],ymm0[13],ymm3[13],ymm0[14],ymm3[14],ymm0[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm13 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm20 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %ymm0 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %ymm1 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %ymm4 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm23, %zmm5 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm22, %zmm5 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm5, (%rsp) # 64-byte Spill -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm5 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = 
ymm9[0],ymm5[0],ymm9[1],ymm5[1],ymm9[2],ymm5[2],ymm9[3],ymm5[3],ymm9[8],ymm5[8],ymm9[9],ymm5[9],ymm9[10],ymm5[10],ymm9[11],ymm5[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm16 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm8 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm8[0],ymm2[0],ymm8[1],ymm2[1],ymm8[2],ymm2[2],ymm8[3],ymm2[3],ymm8[8],ymm2[8],ymm8[9],ymm2[9],ymm8[10],ymm2[10],ymm8[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm19 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm19 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm8[4],ymm2[4],ymm8[5],ymm2[5],ymm8[6],ymm2[6],ymm8[7],ymm2[7],ymm8[12],ymm2[12],ymm8[13],ymm2[13],ymm8[14],ymm2[14],ymm8[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm9[4],ymm5[4],ymm9[5],ymm5[5],ymm9[6],ymm5[6],ymm9[7],ymm5[7],ymm9[12],ymm5[12],ymm9[13],ymm5[13],ymm9[14],ymm5[14],ymm9[15],ymm5[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm18 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm1 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm5 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] -; AVX512DQ-FCP-NEXT: vpermd 
%zmm5, %zmm23, %zmm25 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm22, %zmm25 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm2 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm5[0],ymm2[0],ymm5[1],ymm2[1],ymm5[2],ymm2[2],ymm5[3],ymm2[3],ymm5[8],ymm2[8],ymm5[9],ymm2[9],ymm5[10],ymm2[10],ymm5[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm8, %zmm20, %zmm24 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm8 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm21, %zmm24 {%k1} +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm17 = [0,1,4,20,4,5,5,21,2,1,6,22,6,5,7,23] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm6[0],ymm2[0],ymm6[1],ymm2[1],ymm6[2],ymm2[2],ymm6[3],ymm2[3],ymm6[8],ymm2[8],ymm6[9],ymm2[9],ymm6[10],ymm2[10],ymm6[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm19 = [4,20,1,3,5,21,5,7,6,22,3,3,7,23,7,7] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %ymm12 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm13[0],ymm12[0],ymm13[1],ymm12[1],ymm13[2],ymm12[2],ymm13[3],ymm12[3],ymm13[8],ymm12[8],ymm13[9],ymm12[9],ymm13[10],ymm12[10],ymm13[11],ymm12[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] ; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = 
ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm27 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm27 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm5[4],ymm2[4],ymm5[5],ymm2[5],ymm5[6],ymm2[6],ymm5[7],ymm2[7],ymm5[12],ymm2[12],ymm5[13],ymm2[13],ymm5[14],ymm2[14],ymm5[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm20, %zmm26 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm21, %zmm26 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm0 -; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm23, %zmm28 -; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm22, %zmm28 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[12],ymm0[12],ymm3[13],ymm0[13],ymm3[14],ymm0[14],ymm3[15],ymm0[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm23, %zmm23 -; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm22, %zmm23 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm0 -; AVX512DQ-FCP-NEXT: 
vpunpcklwd {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm20, %zmm22 -; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm21, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm3[4],ymm2[4],ymm3[5],ymm2[5],ymm3[6],ymm2[6],ymm3[7],ymm2[7],ymm3[12],ymm2[12],ymm3[13],ymm2[13],ymm3[14],ymm2[14],ymm3[15],ymm2[15] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm20, %zmm20 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm0 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm21, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm3 = [0,0,0,0,0,0,1,0,2,2,2,2,0,0,3,3] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm3, %zmm21 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm14, %zmm21 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm29 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm29 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm0 -; 
AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm30 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm14, %zmm30 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm31 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm31 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm0 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm1, %zmm14, %zmm6 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm4 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm14, %zmm6 {%k2} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm14, %zmm1 -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm14, %zmm1 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm2 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm2 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm3 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm14, %zmm9 -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm14, %zmm9 {%k2} -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm2 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm3 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm5 = [0,0,1,0,1,0,0,0,2,2,3,3,3,3,0,0] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm5, %zmm8 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm5 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm2, %zmm7, %zmm2 -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm3 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm4 +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm6[4],ymm2[4],ymm6[5],ymm2[5],ymm6[6],ymm2[6],ymm6[7],ymm2[7],ymm6[12],ymm2[12],ymm6[13],ymm2[13],ymm6[14],ymm2[14],ymm6[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm12[4],ymm13[5],ymm12[5],ymm13[6],ymm12[6],ymm13[7],ymm12[7],ymm13[12],ymm12[12],ymm13[13],ymm12[13],ymm13[14],ymm12[14],ymm13[15],ymm12[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm19, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r10), %xmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rax), %xmm1 +; 
AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r9), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%r8), %xmm4 ; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm7, %zmm5 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm12 -; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm14 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm11 = xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm3, %zmm7, %zmm3 -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm11 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm12, %zmm7, %zmm12 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm14 -; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm0 -; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm15, %zmm7, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] -; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm4 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 
16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3] -; AVX512DQ-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; AVX512DQ-FCP-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm11, %xmm11 # 16-byte Folded Reload -; AVX512DQ-FCP-NEXT: # xmm11 = xmm11[0],mem[0],xmm11[1],mem[1],xmm11[2],mem[2],xmm11[3],mem[3] -; AVX512DQ-FCP-NEXT: vpermd %zmm11, %zmm7, %zmm11 -; AVX512DQ-FCP-NEXT: vpermd %zmm0, %zmm7, %zmm11 {%k1} -; AVX512DQ-FCP-NEXT: movb $-86, %al -; AVX512DQ-FCP-NEXT: kmovw %eax, %k1 -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm17 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm10 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm13 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm21, %zmm8 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm29, %zmm2 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqu64 (%rsp), %zmm0 # 64-byte Reload -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, %zmm16 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm19, %zmm18 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm30, %zmm5 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm31, %zmm3 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm25, %zmm24 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm27, %zmm26 {%k1} +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rcx), %xmm2 +; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm18 = [0,16,1,0,1,17,0,0,2,18,3,3,3,19,0,0] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rsi), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa 96(%rdi), %xmm15 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm7 = 
xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm12, %zmm18, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm16, %zmm1 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm2[4],xmm6[5],xmm2[5],xmm6[6],xmm2[6],xmm6[7],xmm2[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm15[4],xmm13[4],xmm15[5],xmm13[5],xmm15[6],xmm13[6],xmm15[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm18, %zmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm2 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %ymm6 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm12 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm12 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %ymm13 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm15 = ymm13[0],ymm2[0],ymm13[1],ymm2[1],ymm13[2],ymm2[2],ymm13[3],ymm2[3],ymm13[8],ymm2[8],ymm13[9],ymm2[9],ymm13[10],ymm2[10],ymm13[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm15, %zmm19, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm7, (%rsp) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm5, %zmm17, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm13[4],ymm2[4],ymm13[5],ymm2[5],ymm13[6],ymm2[6],ymm13[7],ymm2[7],ymm13[12],ymm2[12],ymm13[13],ymm2[13],ymm13[14],ymm2[14],ymm13[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r10), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rax), %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r9), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%r8), %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm16, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rcx), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdx), %xmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rsi), %xmm13 +; AVX512DQ-FCP-NEXT: vmovdqa 64(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = 
xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm0[0],xmm13[0],xmm0[1],xmm13[1],xmm0[2],xmm13[2],xmm0[3],xmm13[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm9 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm9 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm8[4],xmm2[5],xmm8[5],xmm2[6],xmm8[6],xmm2[7],xmm8[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm2 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm15 = xmm0[4],xmm13[4],xmm0[5],xmm13[5],xmm0[6],xmm13[6],xmm0[7],xmm13[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm15 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, %zmm15 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %ymm4 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm4[0],ymm8[0],ymm4[1],ymm8[1],ymm4[2],ymm8[2],ymm4[3],ymm8[3],ymm4[8],ymm8[8],ymm4[9],ymm8[9],ymm4[10],ymm8[10],ymm4[11],ymm8[11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %ymm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %ymm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm5[0],ymm3[0],ymm5[1],ymm3[1],ymm5[2],ymm3[2],ymm5[3],ymm3[3],ymm5[8],ymm3[8],ymm5[9],ymm3[9],ymm5[10],ymm3[10],ymm5[11],ymm3[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm17, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %ymm7 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm7[0],ymm2[0],ymm7[1],ymm2[1],ymm7[2],ymm2[2],ymm7[3],ymm2[3],ymm7[8],ymm2[8],ymm7[9],ymm2[9],ymm7[10],ymm2[10],ymm7[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %ymm1 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm14 = 
ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm9, %zmm19, %zmm14 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm14 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm4[4],ymm8[4],ymm4[5],ymm8[5],ymm4[6],ymm8[6],ymm4[7],ymm8[7],ymm4[12],ymm8[12],ymm4[13],ymm8[13],ymm4[14],ymm8[14],ymm4[15],ymm8[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm5[4],ymm3[4],ymm5[5],ymm3[5],ymm5[6],ymm3[6],ymm5[7],ymm3[7],ymm5[12],ymm3[12],ymm5[13],ymm3[13],ymm5[14],ymm3[14],ymm5[15],ymm3[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm4, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm7[4],ymm2[4],ymm7[5],ymm2[5],ymm7[6],ymm2[6],ymm7[7],ymm2[7],ymm7[12],ymm2[12],ymm7[13],ymm2[13],ymm7[14],ymm2[14],ymm7[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm13 = ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm19, %zmm13 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm13 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r10), %xmm10 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rax), %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r9), %xmm4 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%r8), %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm3[0],xmm10[0],xmm3[1],xmm10[1],xmm3[2],xmm10[2],xmm3[3],xmm10[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm6 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rcx), %xmm7 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-FCP-NEXT: vmovdqa 32(%rdi), %xmm0 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm12 ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, %zmm12 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, %zmm4 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm28, %zmm22 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm23, %zmm20 {%k1} -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm10[4],xmm3[5],xmm10[5],xmm3[6],xmm10[6],xmm3[7],xmm10[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm1, %zmm18, %zmm4 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm4 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa (%r10), %ymm10 +; AVX512DQ-FCP-NEXT: vmovdqa (%rax), %ymm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm10[0],ymm3[1],ymm10[1],ymm3[2],ymm10[2],ymm3[3],ymm10[3],ymm3[8],ymm10[8],ymm3[9],ymm10[9],ymm3[10],ymm10[10],ymm3[11],ymm10[11] +; AVX512DQ-FCP-NEXT: vmovdqa (%r9), %ymm5 +; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %ymm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm0, %zmm17, %zmm7 +; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm8 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm9 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm11 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11] +; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2 +; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} ymm0 = 
ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm11, %zmm19, %zmm0 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm7, %zmm0 {%k1} +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm3[4],ymm10[4],ymm3[5],ymm10[5],ymm3[6],ymm10[6],ymm3[7],ymm10[7],ymm3[12],ymm10[12],ymm3[13],ymm10[13],ymm3[14],ymm10[14],ymm3[15],ymm10[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm5 = ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm17, %zmm5 +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[12],ymm8[12],ymm9[13],ymm8[13],ymm9[14],ymm8[14],ymm9[15],ymm8[15] +; AVX512DQ-FCP-NEXT: vpunpckhwd {{.*#+}} ymm1 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm3, %zmm19, %zmm1 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, %zmm1 {%k1} +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm21, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm22, %xmm3 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm23, %xmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm24, %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm16, %zmm3 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm25, %xmm2 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm26, %xmm5 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm27, %xmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %xmm28, %xmm6 +; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm5 = 
xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; AVX512DQ-FCP-NEXT: vpermt2d %zmm2, %zmm18, %zmm5 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, %zmm5 {%k1} ; AVX512DQ-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm11, (%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 192(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm1, 192(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) ; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm12, 256(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm26, 448(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm24, 384(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm3, 576(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, 512(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm18, 704(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm16, 640(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm2, 832(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm8, 768(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 960(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm10, 896(%rax) -; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) -; AVX512DQ-FCP-NEXT: addq $264, %rsp # imm = 0x108 +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm13, 448(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm14, 384(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm15, 576(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 512(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 704(%rax) +; AVX512DQ-FCP-NEXT: vmovups (%rsp), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 640(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 832(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 768(%rax) +; AVX512DQ-FCP-NEXT: vmovups 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 960(%rax) +; AVX512DQ-FCP-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload +; AVX512DQ-FCP-NEXT: vmovaps %zmm0, 896(%rax) +; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm20, 64(%rax) +; AVX512DQ-FCP-NEXT: addq $328, %rsp # imm = 0x148 ; AVX512DQ-FCP-NEXT: vzeroupper ; AVX512DQ-FCP-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index 86efcf9c57616..ad9db98711a62 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -1190,8 +1190,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero ; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] ; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] @@ -1233,8 +1232,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm4, %ymm5, 
%ymm4 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 +; AVX2-FP-NEXT: vpor %ymm3, %ymm4, %ymm3 ; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,1,2,2] ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255] @@ -1461,20 +1459,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero ; AVX512BW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[3,19],zero,zero,zero,ymm4[28,20],zero,zero,zero,ymm4[29,21],zero,zero,zero,ymm4[30,22] +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero ; AVX512BW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 -; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512BW-NEXT: 
vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] ; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512BW-NEXT: vporq %zmm3, %zmm4, %zmm3 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 @@ -1531,20 +1527,18 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9],zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28],zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm3, %ymm3 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,2,0] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[3,19],zero,zero,zero,ymm4[28,20],zero,zero,zero,ymm4[29,21],zero,zero,zero,ymm4[30,22] +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm1[6],zero,zero,zero,zero,ymm1[7],zero,zero,zero,zero,ymm1[8],zero,zero,zero,zero,ymm1[9,25],zero,zero,zero,zero,ymm1[26],zero,zero,zero,zero,ymm1[27],zero,zero,zero,zero,ymm1[28] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = 
ymm1[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero ; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318 -; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm4 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm0[0,2,2,0] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,8],zero,zero,zero,ymm3[1,9],zero,zero,zero,ymm3[2,10],zero,zero,zero,ymm3[3,19],zero,zero,zero,ymm3[28,20],zero,zero,zero,ymm3[29,21],zero,zero,zero,ymm3[30,22] ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[19,27],zero,zero,zero,ymm5[20,28],zero,zero,zero,ymm5[21,29],zero,zero,zero -; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm5, %ymm3 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm4, %zmm5, %zmm4 +; AVX512DQ-BW-NEXT: vporq %zmm3, %zmm4, %zmm3 ; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} zmm4 = [0,0,0,0,0,0,1,1,1,1,2,2,2,2,2,2] ; AVX512DQ-BW-NEXT: vpermd %zmm2, %zmm4, %zmm4 ; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 6d499e17bfbc6..03f5b90002d34 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -2996,94 +2996,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512BW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512BW-NEXT: vmovdqa 
(%rcx), %ymm6 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %ymm4, %ymm3, %ymm7 -; AVX512BW-NEXT: vpshufb %ymm4, %ymm2, %ymm4 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm7, %ymm8, %ymm7 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX512BW-NEXT: vpshufb %ymm8, %ymm5, %ymm9 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm9 = 
ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512BW-NEXT: movl $1227114788, %r10d # imm = 0x49244924 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512BW-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; 
AVX512BW-NEXT: vpshufb %ymm7, %ymm0, %ymm10 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 -; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512BW-NEXT: vpshufb %xmm8, %xmm9, %xmm10 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512BW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm8 {%k1} +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-NEXT: vpshufb %ymm9, %ymm1, %ymm6 +; AVX512BW-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512BW-NEXT: kmovd %r10d, %k2 +; AVX512BW-NEXT: vmovdqu16 %ymm6, %ymm8 {%k2} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-NEXT: vpermi2w %ymm6, %ymm10, %ymm11 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm6 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512BW-NEXT: vpermi2w %zmm11, %zmm6, %zmm10 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm6 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; 
AVX512BW-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-NEXT: vpermi2w %zmm8, %zmm13, %zmm14 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %xmm8, %xmm10, %xmm13 -; AVX512BW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512BW-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512BW-NEXT: movl $1227105426, %ecx # imm = 0x49242492 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-NEXT: vpshufb %xmm7, %xmm8, %xmm13 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm15 -; AVX512BW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX512BW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm11, %xmm13 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-NEXT: vpshufb %xmm7, %xmm14, %xmm7 ; AVX512BW-NEXT: vpunpcklbw 
{{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-NEXT: vpermi2w %zmm7, %zmm13, %zmm16 -; AVX512BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm7 {%k1} +; AVX512BW-NEXT: vmovdqa (%r9), %xmm10 +; AVX512BW-NEXT: vpshufb %xmm9, %xmm10, %xmm13 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm15 +; AVX512BW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512BW-NEXT: vmovdqu16 %ymm9, %ymm7 {%k2} +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512BW-NEXT: vpermi2w %ymm9, %ymm13, %ymm16 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vpermi2w %ymm9, %ymm16, %ymm13 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0] +; AVX512BW-NEXT: vpermi2w %ymm9, %ymm8, %ymm11 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512BW-NEXT: vpermi2w %ymm8, %ymm11, %ymm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13] +; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31] ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] -; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-NEXT: movl 
$1227133513, %ecx # imm = 0x49249249 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm0 +; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -3092,93 +3086,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm7 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm4 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512BW-FCP-NEXT: 
vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm7, %ymm8, %ymm7 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX512BW-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: movl $1227114788, %r10d # imm = 0x49244924 -; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm10 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = 
ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX512BW-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = 
ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm8 {%k1} +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm6 +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512BW-FCP-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm8 {%k2} +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; 
AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512BW-FCP-NEXT: vpermi2w %ymm6, %ymm10, %ymm11 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm6 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512BW-FCP-NEXT: vpermi2w %zmm11, %zmm6, %zmm10 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11 +; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512BW-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm10 ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermi2w %zmm10, %zmm14, %zmm15 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm10 -; AVX512BW-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm8 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermi2w %zmm8, %zmm10, %zmm14 -; AVX512BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm13 +; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm7 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm13 ; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermi2w %zmm7, %zmm10, %zmm16 -; AVX512BW-FCP-NEXT: movl $613566756, %ecx # imm = 
0x24924924 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512BW-FCP-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 +; AVX512BW-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm7 {%k2} +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512BW-FCP-NEXT: vpermi2w %ymm9, %ymm13, %ymm16 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vpermi2w %ymm9, 
%ymm16, %ymm13 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0] +; AVX512BW-FCP-NEXT: vpermi2w %ymm9, %ymm8, %ymm11 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512BW-FCP-NEXT: vpermi2w %ymm8, %ymm11, %ymm9 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; 
AVX512BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13] +; AVX512BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31] ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] -; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512BW-FCP-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm3 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -3187,94 +3176,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512DQ-BW-NEXT: 
vmovdqa (%rdx), %ymm4 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512DQ-BW-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm3, %ymm7 -; AVX512DQ-BW-NEXT: vpshufb %ymm4, %ymm2, %ymm4 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX512DQ-BW-NEXT: vpshufb %ymm8, %ymm5, %ymm9 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm9 = 
ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512DQ-BW-NEXT: movl $1227114788, %r10d # imm = 0x49244924 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = 
[6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm1, %ymm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm0, %ymm10 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm9 -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm9, %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm11 -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm11, %xmm8 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm10 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm8 {%k1} +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm1, %ymm6 +; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm6 = 
ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm6, %ymm8 {%k2} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-NEXT: vpermi2w %ymm6, %ymm10, %ymm11 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm6 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-BW-NEXT: vpermi2w %zmm11, %zmm6, %zmm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm6 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm8 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm10 = 
[8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm12 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-NEXT: vpermi2w %zmm8, %zmm13, %zmm14 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm10, %xmm13 -; AVX512DQ-BW-NEXT: vpshufb %xmm8, %xmm12, %xmm8 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm8[8],xmm13[8],xmm8[9],xmm13[9],xmm8[10],xmm13[10],xmm8[11],xmm13[11],xmm8[12],xmm13[12],xmm8[13],xmm13[13],xmm8[14],xmm13[14],xmm8[15],xmm13[15] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm13, %xmm13 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm13, %zmm8 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] -; AVX512DQ-BW-NEXT: movl $1227105426, %ecx # imm = 0x49242492 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm8, %xmm13 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm15, %xmm7 +; AVX512DQ-BW-NEXT: vpshufb %xmm10, %xmm12, %xmm10 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm11 +; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm11, %xmm13 +; 
AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-NEXT: vpshufb %xmm7, %xmm14, %xmm7 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-NEXT: vpermi2w %zmm7, %zmm13, %zmm16 -; AVX512DQ-BW-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512DQ-BW-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm10, %ymm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm10 +; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm10, %xmm13 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm15 +; AVX512DQ-BW-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512DQ-BW-NEXT: vpermq 
{{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm9, %ymm7 {%k2} +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-NEXT: vpermi2w %ymm9, %ymm13, %ymm16 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vpermi2w %ymm9, %ymm16, %ymm13 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0] +; AVX512DQ-BW-NEXT: vpermi2w %ymm9, %ymm8, %ymm11 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512DQ-BW-NEXT: vpermi2w %ymm8, %ymm11, %ymm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm4 = 
ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13] +; AVX512DQ-BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31] ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] -; AVX512DQ-BW-NEXT: vpermi2w %zmm0, 
%zmm1, %zmm2 -; AVX512DQ-BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -3283,93 +3266,88 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm5 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %ymm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm4 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm2, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm7[0],ymm4[1],ymm7[1],ymm4[2],ymm7[2],ymm4[3],ymm7[3],ymm4[4],ymm7[4],ymm4[5],ymm7[5],ymm4[6],ymm7[6],ymm4[7],ymm7[7],ymm4[16],ymm7[16],ymm4[17],ymm7[17],ymm4[18],ymm7[18],ymm4[19],ymm7[19],ymm4[20],ymm7[20],ymm4[21],ymm7[21],ymm4[22],ymm7[22],ymm4[23],ymm7[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm5[8],ymm6[8],ymm5[9],ymm6[9],ymm5[10],ymm6[10],ymm5[11],ymm6[11],ymm5[12],ymm6[12],ymm5[13],ymm6[13],ymm5[14],ymm6[14],ymm5[15],ymm6[15],ymm5[24],ymm6[24],ymm5[25],ymm6[25],ymm5[26],ymm6[26],ymm5[27],ymm6[27],ymm5[28],ymm6[28],ymm5[29],ymm6[29],ymm5[30],ymm6[30],ymm5[31],ymm6[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm8 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm7, %ymm8, %ymm7 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm6, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm8, %ymm5, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm9[0],ymm4[0],ymm9[1],ymm4[1],ymm9[2],ymm4[2],ymm9[3],ymm4[3],ymm9[4],ymm4[4],ymm9[5],ymm4[5],ymm9[6],ymm4[6],ymm9[7],ymm4[7],ymm9[16],ymm4[16],ymm9[17],ymm4[17],ymm9[18],ymm4[18],ymm9[19],ymm4[19],ymm9[20],ymm4[20],ymm9[21],ymm4[21],ymm9[22],ymm4[22],ymm9[23],ymm4[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm9 = ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm9, %ymm10, %ymm9 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm9, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: movl $1227114788, %r10d # imm = 0x49244924 -; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm1, 
%ymm9 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm0, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [10,13,12,11,10,13,12,11,10,13,12,11,14,13,14,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 -; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm3, %ymm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm6, %ymm2, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm6[0],ymm7[0],ymm6[1],ymm7[1],ymm6[2],ymm7[2],ymm6[3],ymm7[3],ymm6[4],ymm7[4],ymm6[5],ymm7[5],ymm6[6],ymm7[6],ymm6[7],ymm7[7],ymm6[16],ymm7[16],ymm6[17],ymm7[17],ymm6[18],ymm7[18],ymm6[19],ymm7[19],ymm6[20],ymm7[20],ymm6[21],ymm7[21],ymm6[22],ymm7[22],ymm6[23],ymm7[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm5, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm7, %ymm4, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm8 = 
ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[4],ymm8[4],ymm9[5],ymm8[5],ymm9[6],ymm8[6],ymm9[7],ymm8[7],ymm9[16],ymm8[16],ymm9[17],ymm8[17],ymm9[18],ymm8[18],ymm9[19],ymm8[19],ymm9[20],ymm8[20],ymm9[21],ymm8[21],ymm9[22],ymm8[22],ymm9[23],ymm8[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm9, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm8 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm1, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm0, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm6 = ymm10[0],ymm6[0],ymm10[1],ymm6[1],ymm10[2],ymm6[2],ymm10[3],ymm6[3],ymm10[4],ymm6[4],ymm10[5],ymm6[5],ymm10[6],ymm6[6],ymm10[7],ymm6[7],ymm10[16],ymm6[16],ymm10[17],ymm6[17],ymm10[18],ymm6[18],ymm10[19],ymm6[19],ymm10[20],ymm6[20],ymm10[21],ymm6[21],ymm10[22],ymm6[22],ymm10[23],ymm6[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: movw $9362, %r10w # imm = 0x2492 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm6, %ymm8 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15],ymm4[24],ymm5[24],ymm4[25],ymm5[25],ymm4[26],ymm5[26],ymm4[27],ymm5[27],ymm4[28],ymm5[28],ymm4[29],ymm5[29],ymm4[30],ymm5[30],ymm4[31],ymm5[31] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm10 = 
ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15],ymm2[24],ymm3[24],ymm2[25],ymm3[25],ymm2[26],ymm3[26],ymm2[27],ymm3[27],ymm2[28],ymm3[28],ymm2[29],ymm3[29],ymm2[30],ymm3[30],ymm2[31],ymm3[31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,11,27,0,12,28,0,13,29,0,14,30,0,15,31,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm6, %ymm10, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm11, %zmm0, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm10 = [42,17,18,43,20,21,44,23,24,45,26,27,46,29,30,47] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm11 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm11, %zmm6, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm10, %zmm8, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm8 ; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm10 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm9, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm8, %xmm11 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm12 ; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm10, %xmm12, %xmm10 ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm10[8],xmm11[8],xmm10[9],xmm11[9],xmm10[10],xmm11[10],xmm10[11],xmm11[11],xmm10[12],xmm11[12],xmm10[13],xmm11[13],xmm10[14],xmm11[14],xmm10[15],xmm11[15] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,0,1] ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbw {{.*#+}} zmm15 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm10, %zmm14, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm11, %xmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm8, %xmm13, %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm12[0],xmm9[0],xmm12[1],xmm9[1],xmm12[2],xmm9[2],xmm12[3],xmm9[3],xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm8, %zmm10, %zmm14 -; AVX512DQ-BW-FCP-NEXT: movl $1227105426, %ecx # imm = 0x49242492 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm15, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm8, %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm11, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm14, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm13[0],xmm7[1],xmm13[1],xmm7[2],xmm13[2],xmm7[3],xmm13[3],xmm7[4],xmm13[4],xmm7[5],xmm13[5],xmm7[6],xmm13[6],xmm7[7],xmm13[7] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm7 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm10, %xmm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm15 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm15, %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw 
{{.*#+}} xmm10 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm16 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm7, %zmm10, %zmm16 -; AVX512DQ-BW-FCP-NEXT: movl $613566756, %ecx # imm = 0x24924924 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm14 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm5[0],ymm6[0],ymm5[1],ymm6[1],ymm5[2],ymm6[2],ymm5[3],ymm6[3],ymm5[4],ymm6[4],ymm5[5],ymm6[5],ymm5[6],ymm6[6],ymm5[7],ymm6[7],ymm5[16],ymm6[16],ymm5[17],ymm6[17],ymm5[18],ymm6[18],ymm5[19],ymm6[19],ymm5[20],ymm6[20],ymm5[21],ymm6[21],ymm5[22],ymm6[22],ymm5[23],ymm6[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm7 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm5, %zmm6, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm9, %xmm15, %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm9, %ymm7 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm14[0],xmm11[0],xmm14[1],xmm11[1],xmm14[2],xmm11[2],xmm14[3],xmm11[3],xmm14[4],xmm11[4],xmm14[5],xmm11[5],xmm14[6],xmm11[6],xmm14[7],xmm11[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; AVX512DQ-BW-FCP-NEXT: 
vpmovsxbw {{.*#+}} ymm16 = [0,16,0,1,17,0,2,18,0,3,19,0,4,20,0,5] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm9, %ymm13, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm10[0],xmm15[1],xmm10[1],xmm15[2],xmm10[2],xmm15[3],xmm10[3],xmm15[4],xmm10[4],xmm15[5],xmm10[5],xmm15[6],xmm10[6],xmm15[7],xmm10[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm9, %ymm16, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm13, %zmm7 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm14[8],xmm11[8],xmm14[9],xmm11[9],xmm14[10],xmm11[10],xmm14[11],xmm11[11],xmm14[12],xmm11[12],xmm14[13],xmm11[13],xmm14[14],xmm11[14],xmm14[15],xmm11[15] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm12[8],xmm8[8],xmm12[9],xmm8[9],xmm12[10],xmm8[10],xmm12[11],xmm8[11],xmm12[12],xmm8[12],xmm12[13],xmm8[13],xmm12[14],xmm8[14],xmm12[15],xmm8[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [0,3,19,0,4,20,0,5,21,0,6,22,0,7,23,0] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm9, %ymm8, %ymm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm9 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm8, %ymm11, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[16],ymm5[16],ymm4[17],ymm5[17],ymm4[18],ymm5[18],ymm4[19],ymm5[19],ymm4[20],ymm5[20],ymm4[21],ymm5[21],ymm4[22],ymm5[22],ymm4[23],ymm5[23] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = 
ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm7, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [8,24,0,9,25,0,10,26,0,11,27,0,12,28,0,13] +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm4, %ymm2, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm3 = [16,17,40,19,20,41,22,23,42,25,26,43,28,29,44,31] ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm2 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,42,41,40,43,42,41,40,43,42,41,40,43,44,44,44,44] -; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 -; AVX512DQ-BW-FCP-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm14, (%rax) -; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vpermi2w %zmm0, %zmm2, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 128(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <32 x i8>, ptr %in.vecptr0, align 64 @@ -6368,726 +6346,770 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-LABEL: store_i8_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %ymm7, %ymm1, %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512BW-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm3, %ymm8, %ymm3 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 -; AVX512BW-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} 
ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-NEXT: vpshufb %ymm12, %ymm3, %ymm0 -; AVX512BW-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512BW-NEXT: vpshufb %ymm12, %ymm4, %ymm10 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 -; AVX512BW-NEXT: kmovd %r10d, %k1 -; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512BW-NEXT: vpshufb %zmm14, %zmm5, %zmm5 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm0 +; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm6, %xmm3 
+; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm3 +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-NEXT: vmovdqa64 (%rcx), %xmm16 +; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512BW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512BW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512BW-NEXT: vpshufb %xmm0, %xmm11, %xmm9 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512BW-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] ; AVX512BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512BW-NEXT: kmovd %r10d, %k1 +; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm8 = xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512BW-NEXT: vpermt2w %ymm8, %ymm13, %ymm12 +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = 
[16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm3 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 +; AVX512BW-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm20, %zmm14 +; AVX512BW-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 +; AVX512BW-NEXT: kmovq %r10, %k2 +; AVX512BW-NEXT: vmovdqu8 %zmm14, %zmm3 {%k2} +; AVX512BW-NEXT: vpshufb %xmm5, %xmm15, %xmm14 +; AVX512BW-NEXT: vpshufb %xmm5, %xmm18, %xmm5 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] +; AVX512BW-NEXT: vpermt2w %zmm5, %zmm7, %zmm14 +; AVX512BW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 +; AVX512BW-NEXT: vpshufb %xmm0, %xmm17, %xmm7 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-NEXT: vprold $16, %xmm7, %xmm7 
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5] +; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k1} +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512BW-NEXT: vpermi2w %ymm7, %ymm14, %ymm13 +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] +; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX512BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm14 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm7 +; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm13 +; AVX512BW-NEXT: vmovdqu8 %zmm13, %zmm7 {%k2} +; AVX512BW-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512BW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 +; AVX512BW-NEXT: vmovdqa64 (%rsi), %ymm16 +; 
AVX512BW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm18, %zmm15 +; AVX512BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512BW-NEXT: kmovd %r10d, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k2} -; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7,4,5,6,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: movabsq $-9076969306111049208, %r10 # imm = 0x8208208208208208 -; AVX512BW-NEXT: kmovq %r10, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k3} -; AVX512BW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512BW-NEXT: vpshufb %ymm7, %ymm9, %ymm5 -; AVX512BW-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512BW-NEXT: vpshufb %ymm7, %ymm10, %ymm6 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = 
ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX512BW-NEXT: vpermw %ymm6, %ymm8, %ymm6 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512BW-NEXT: vpshufb %ymm12, %ymm17, %ymm5 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512BW-NEXT: vpshufb %ymm12, %ymm19, %ymm7 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23] -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = ymm19[8],ymm17[8],ymm19[9],ymm17[9],ymm19[10],ymm17[10],ymm19[11],ymm17[11],ymm19[12],ymm17[12],ymm19[13],ymm17[13],ymm19[14],ymm17[14],ymm19[15],ymm17[15],ymm19[24],ymm17[24],ymm19[25],ymm17[25],ymm19[26],ymm17[26],ymm19[27],ymm17[27],ymm19[28],ymm17[28],ymm19[29],ymm17[29],ymm19[30],ymm17[30],ymm19[31],ymm17[31] -; AVX512BW-NEXT: vpermw %ymm7, %ymm11, %ymm7 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k1} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb %zmm14, %zmm13, %zmm6 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2} -; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-NEXT: vpermq {{.*#+}} zmm6 = 
zmm6[2,2,2,3,6,6,6,7] -; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k3} -; AVX512BW-NEXT: vmovdqa64 (%rsi), %xmm21 -; AVX512BW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-NEXT: vpshufb %xmm20, %xmm7, %xmm6 -; AVX512BW-NEXT: vmovdqa64 (%rdi), %xmm22 -; AVX512BW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512BW-NEXT: vpshufb %xmm20, %xmm8, %xmm14 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-NEXT: vpermt2w %zmm14, %zmm25, %zmm6 -; AVX512BW-NEXT: vmovdqa64 (%rcx), %xmm23 -; AVX512BW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm15, %xmm14 -; AVX512BW-NEXT: vmovdqa64 (%rdx), %xmm24 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm18, %xmm16 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512BW-NEXT: vprold $16, %xmm16, %xmm16 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm14 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5] -; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm16 = 
xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm26 = xmm14[2,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm27, %zmm16 -; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm16 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm28 = xmm16[2,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermt2w %zmm28, %zmm27, %zmm26 -; AVX512BW-NEXT: movabsq $585610922974906400, %rcx # imm = 0x820820820820820 -; AVX512BW-NEXT: kmovq %rcx, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm26, %zmm6 {%k3} -; AVX512BW-NEXT: vpshufb %xmm20, %xmm21, %xmm26 -; AVX512BW-NEXT: vpshufb %xmm20, %xmm22, %xmm20 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm20[8],xmm26[8],xmm20[9],xmm26[9],xmm20[10],xmm26[10],xmm20[11],xmm26[11],xmm20[12],xmm26[12],xmm20[13],xmm26[13],xmm20[14],xmm26[14],xmm20[15],xmm26[15] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm20 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm23, %xmm25 -; AVX512BW-NEXT: vpshufb %xmm12, %xmm24, %xmm12 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm25[0],xmm12[1],xmm25[1],xmm12[2],xmm25[2],xmm12[3],xmm25[3],xmm12[4],xmm25[4],xmm12[5],xmm25[5],xmm12[6],xmm25[6],xmm12[7],xmm25[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = 
xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512BW-NEXT: vprold $16, %xmm25, %xmm25 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm25, %zmm12 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5] -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm20 {%k2} -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm13[2,1,2,3] -; AVX512BW-NEXT: vpmovzxbw {{.*#+}} xmm25 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm27, %zmm12 -; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm11[2,1,2,3] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm25[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-NEXT: vpermt2w %zmm25, %zmm27, %zmm12 -; AVX512BW-NEXT: vmovdqu8 %zmm12, %zmm20 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm19[0],ymm17[0],ymm19[1],ymm17[1],ymm19[2],ymm17[2],ymm19[3],ymm17[3],ymm19[4],ymm17[4],ymm19[5],ymm17[5],ymm19[6],ymm17[6],ymm19[7],ymm17[7],ymm19[16],ymm17[16],ymm19[17],ymm17[17],ymm19[18],ymm17[18],ymm19[19],ymm17[19],ymm19[20],ymm17[20],ymm19[21],ymm17[21],ymm19[22],ymm17[22],ymm19[23],ymm17[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm17 +; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm15 {%k2} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm15, %ymm19 +; 
AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %ymm20, %ymm5, %ymm22 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512BW-NEXT: kmovd %r10d, %k3 +; AVX512BW-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512BW-NEXT: vpermt2w %ymm22, %ymm23, %ymm15 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm15 +; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vpshufb %ymm19, %ymm8, %ymm22 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512BW-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512BW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512BW-NEXT: kmovq %r10, %k4 +; AVX512BW-NEXT: vmovdqu8 %zmm24, %zmm15 {%k4} +; AVX512BW-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512BW-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; 
AVX512BW-NEXT: vpermt2w %zmm26, %zmm21, %zmm10 +; AVX512BW-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512BW-NEXT: vmovdqa64 32(%rdi), %ymm21 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm18, %zmm4 +; AVX512BW-NEXT: vmovdqu16 %zmm10, %zmm4 {%k2} +; AVX512BW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 +; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512BW-NEXT: vpshufb %ymm20, %ymm10, %ymm10 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermt2w %ymm9, %ymm23, %ymm4 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512BW-NEXT: vpshufb %ymm19, %ymm6, %ymm6 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-NEXT: vpermt2w %zmm6, %zmm25, %zmm9 +; AVX512BW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k4} +; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-NEXT: vpshufb %ymm6, %ymm11, %ymm9 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm21, %ymm10 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = 
ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm10 -; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm10 {%k1} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm9, %ymm13, %ymm17 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm21, %zmm13 -; AVX512BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512BW-NEXT: kmovd %ecx, %k2 -; AVX512BW-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} -; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512BW-NEXT: vpshufb %ymm13, %ymm11, %ymm17 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm17, %zmm21, %zmm11 -; AVX512BW-NEXT: movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = 
ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512BW-NEXT: vpshufb %ymm0, %ymm22, %ymm10 +; AVX512BW-NEXT: vpshufb %ymm0, %ymm24, %ymm12 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-NEXT: vpermw %ymm12, %ymm18, %ymm12 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} +; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512BW-NEXT: vpshufb %zmm9, %zmm2, %zmm2 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} +; AVX512BW-NEXT: 
vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 ; AVX512BW-NEXT: kmovq %rcx, %k3 -; AVX512BW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] -; AVX512BW-NEXT: vpermt2w %zmm3, %zmm19, %zmm4 +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} +; AVX512BW-NEXT: vpshufb %ymm6, %ymm16, %ymm1 +; AVX512BW-NEXT: vpshufb %ymm6, %ymm17, %ymm2 ; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm2 -; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} -; AVX512BW-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512BW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, 
%zmm3 -; AVX512BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k2} -; AVX512BW-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512BW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 -; AVX512BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] +; AVX512BW-NEXT: vpermw %ymm2, %ymm11, %ymm2 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512BW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512BW-NEXT: vpermw %ymm2, %ymm18, %ymm2 +; 
AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512BW-NEXT: vpshufb %zmm9, %zmm5, %zmm1 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512BW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; ; AVX512BW-FCP-LABEL: store_i8_stride6_vf64: ; AVX512BW-FCP: # %bb.0: -; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm2 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = 
ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm3, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm24 -; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm26 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm19 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm16, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm20, %xmm13 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: 
vpermt2w %zmm12, %zmm18, %zmm13 -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm15, %xmm12 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm19, %xmm14 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512BW-FCP-NEXT: vpermt2w %zmm14, %zmm25, %zmm12 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm28 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm28, %xmm17 -; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm14 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512BW-FCP-NEXT: vpermt2w %zmm17, %zmm30, %zmm14 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm17 -; AVX512BW-FCP-NEXT: vpshufb %xmm2, %xmm22, %xmm31 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm17[0],xmm31[1],xmm17[1],xmm31[2],xmm17[2],xmm31[3],xmm17[3],xmm31[4],xmm17[4],xmm31[5],xmm17[5],xmm31[6],xmm17[6],xmm31[7],xmm17[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm18, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm31 = 
[u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm24, %xmm18 -; AVX512BW-FCP-NEXT: vpshufb %xmm23, %xmm26, %xmm23 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm25, %zmm18 -; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm29, %xmm25 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm29[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm30, %zmm23 -; AVX512BW-FCP-NEXT: vpshufb %xmm27, %xmm8, %xmm27 -; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm25 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero -; AVX512BW-FCP-NEXT: vpermt2w %zmm27, %zmm30, %zmm25 -; AVX512BW-FCP-NEXT: vpshufb %xmm31, %xmm9, %xmm31 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm30, %zmm27 -; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm30 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm30[0],ymm7[0],ymm30[1],ymm7[1],ymm30[2],ymm7[2],ymm30[3],ymm7[3],ymm30[4],ymm7[4],ymm30[5],ymm7[5],ymm30[6],ymm7[6],ymm30[7],ymm7[7],ymm30[16],ymm7[16],ymm30[17],ymm7[17],ymm30[18],ymm7[18],ymm30[19],ymm7[19],ymm30[20],ymm7[20],ymm30[21],ymm7[21],ymm30[22],ymm7[22],ymm30[23],ymm7[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512BW-FCP-NEXT: vpermt2w 
%zmm31, %zmm0, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm24, %zmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm26 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm20[8],xmm16[8],xmm20[9],xmm16[9],xmm20[10],xmm16[10],xmm20[11],xmm16[11],xmm20[12],xmm16[12],xmm20[13],xmm16[13],xmm20[14],xmm16[14],xmm20[15],xmm16[15] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm4[0],ymm26[1],ymm4[1],ymm26[2],ymm4[2],ymm26[3],ymm4[3],ymm26[4],ymm4[4],ymm26[5],ymm4[5],ymm26[6],ymm4[6],ymm26[7],ymm4[7],ymm26[16],ymm4[16],ymm26[17],ymm4[17],ymm26[18],ymm4[18],ymm26[19],ymm4[19],ymm26[20],ymm4[20],ymm26[21],ymm4[21],ymm26[22],ymm4[22],ymm26[23],ymm4[23] -; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm0, %zmm16 -; AVX512BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm31 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm20 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm11 = 
ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15] +; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm3 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm11 +; AVX512BW-FCP-NEXT: vpunpckhbw 
{{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm13, %zmm12 +; AVX512BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k1 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm12 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512BW-FCP-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512BW-FCP-NEXT: vpermt2w %ymm3, %ymm14, %ymm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm11, %xmm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm19, %zmm12 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm3 +; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm12, %xmm4 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm23, %zmm22 +; AVX512BW-FCP-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 +; AVX512BW-FCP-NEXT: kmovq %r10, %k2 +; 
AVX512BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm3 {%k2} +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm0, %xmm16, %xmm22 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm5, %zmm22 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm17, %xmm4 +; AVX512BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm5 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm5 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm5 {%k1} +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512BW-FCP-NEXT: vpermi2w %ymm7, %ymm5, %ymm14 +; AVX512BW-FCP-NEXT: vpshufb %xmm20, %xmm4, %xmm7 +; AVX512BW-FCP-NEXT: vpermt2w %zmm7, %zmm19, %zmm5 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpshufb %xmm21, %xmm7, %xmm13 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm23, %zmm14 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512BW-FCP-NEXT: 
vmovdqa (%rdx), %ymm14 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512BW-FCP-NEXT: vpermt2w %zmm19, %zmm18, %zmm17 +; AVX512BW-FCP-NEXT: movl $613566756, %r10d # imm = 0x24924924 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k2 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm17 {%k2} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm17, %ymm19 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; 
AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm22 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512BW-FCP-NEXT: kmovd %r10d, %k3 +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512BW-FCP-NEXT: vpermt2w %ymm22, %ymm23, %ymm17 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm17 ; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm24, %zmm10 -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm11 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm11, %zmm0, %zmm15 -; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512BW-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm11 -; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm1, %ymm1 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm19 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm26, %ymm1 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FCP-NEXT: 
vpunpckhbw {{.*#+}} ymm1 = ymm26[8],ymm4[8],ymm26[9],ymm4[9],ymm26[10],ymm4[10],ymm26[11],ymm4[11],ymm26[12],ymm4[12],ymm26[13],ymm4[13],ymm26[14],ymm4[14],ymm26[15],ymm4[15],ymm26[24],ymm4[24],ymm26[25],ymm4[25],ymm26[26],ymm4[26],ymm26[27],ymm4[27],ymm26[28],ymm4[28],ymm26[29],ymm4[29],ymm26[30],ymm4[30],ymm26[31],ymm4[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm24 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm1, %ymm24, %ymm1 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 -; AVX512BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924 -; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512BW-FCP-NEXT: vpshufb %zmm1, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: movl $-1840700270, %eax # imm = 0x92492492 -; AVX512BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7] -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 -; AVX512BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k3} -; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm3, %ymm6, %ymm3 -; AVX512BW-FCP-NEXT: 
vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512BW-FCP-NEXT: vpermw %ymm3, %ymm5, %ymm3 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512BW-FCP-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512BW-FCP-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512BW-FCP-NEXT: kmovq %r10, %k4 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm24, %zmm17 {%k4} +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm21, %zmm6 +; 
AVX512BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm21 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm26, %zmm18, %zmm9 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm9 {%k2} +; AVX512BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm6 +; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm20, %ymm10, %ymm10 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpermt2w %ymm10, %ymm23, %ymm9 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 +; AVX512BW-FCP-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm9 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm9, %zmm25, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm6 {%k4} +; AVX512BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm21, %ymm11 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = 
ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512BW-FCP-NEXT: vpermw %ymm8, %ymm11, %ymm8 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm22, %ymm10 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm24, %ymm12 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512BW-FCP-NEXT: vpermw %ymm12, %ymm18, %ymm12 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, 
%ymm12, %zmm10, %zmm10 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k2} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm2, %zmm2 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} +; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512BW-FCP-NEXT: kmovq %rcx, %k3 +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm9, %ymm16, %ymm2 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31] +; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm11, %ymm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb %ymm0, 
%ymm14, %ymm0 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm3 -; AVX512BW-FCP-NEXT: vpshufb %ymm2, %ymm30, %ymm2 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm30[8],ymm7[8],ymm30[9],ymm7[9],ymm30[10],ymm7[10],ymm30[11],ymm7[11],ymm30[12],ymm7[12],ymm30[13],ymm7[13],ymm30[14],ymm7[14],ymm30[15],ymm7[15],ymm30[24],ymm7[24],ymm30[25],ymm7[25],ymm30[26],ymm7[26],ymm30[27],ymm7[27],ymm30[28],ymm7[28],ymm30[29],ymm7[29],ymm30[30],ymm7[30],ymm30[31],ymm7[31] -; AVX512BW-FCP-NEXT: vpermw %ymm3, %ymm24, %ymm3 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpshufb %zmm1, %zmm8, %zmm0 -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} -; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm21 {%k1} -; AVX512BW-FCP-NEXT: movl 
$1227133513, %eax # imm = 0x49249249 -; AVX512BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm21 {%k3} -; AVX512BW-FCP-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 -; AVX512BW-FCP-NEXT: kmovq %rax, %k4 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm21 {%k4} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512BW-FCP-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 -; AVX512BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm12 {%k5} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm18 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm18 {%k5} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm10 {%k1} -; AVX512BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3} -; AVX512BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm10 {%k4} -; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512BW-FCP-NEXT: vpermw %ymm2, %ymm18, %ymm2 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = 
zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; ; AVX512DQ-BW-LABEL: store_i8_stride6_vf64: ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm1 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm1, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm2, %ymm3 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15],ymm2[24],ymm1[24],ymm2[25],ymm1[25],ymm2[26],ymm1[26],ymm2[27],ymm1[27],ymm2[28],ymm1[28],ymm2[29],ymm1[29],ymm2[30],ymm1[30],ymm2[31],ymm1[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm3, %ymm8, %ymm3 -; 
AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm9 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %ymm3 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm12 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm3, %ymm0 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %ymm4 -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm4, %ymm10 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm10[0],ymm0[0],ymm10[1],ymm0[1],ymm10[2],ymm0[2],ymm10[3],ymm0[3],ymm10[4],ymm0[4],ymm10[5],ymm0[5],ymm10[6],ymm0[6],ymm10[7],ymm0[7],ymm10[16],ymm0[16],ymm10[17],ymm0[17],ymm10[18],ymm0[18],ymm10[19],ymm0[19],ymm10[20],ymm0[20],ymm10[21],ymm0[21],ymm10[22],ymm0[22],ymm10[23],ymm0[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm4[8],ymm3[8],ymm4[9],ymm3[9],ymm4[10],ymm3[10],ymm4[11],ymm3[11],ymm4[12],ymm3[12],ymm4[13],ymm3[13],ymm4[14],ymm3[14],ymm4[15],ymm3[15],ymm4[24],ymm3[24],ymm4[25],ymm3[25],ymm4[26],ymm3[26],ymm4[27],ymm3[27],ymm4[28],ymm3[28],ymm4[29],ymm3[29],ymm4[30],ymm3[30],ymm4[31],ymm3[31] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 -; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm14 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512DQ-BW-NEXT: vpshufb %zmm14, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-BW-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm15 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm4 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = 
[8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm4, %xmm0 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm6 +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm6, %xmm3 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm3 +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm0 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %xmm16 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm10 +; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm10, %xmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm17 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rdx), %xmm11 +; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm11, %xmm9 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] +; AVX512DQ-BW-NEXT: vprold $16, %xmm9, %xmm9 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm8 = zmm8[0,0,0,1,4,4,4,5] ; AVX512DQ-BW-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512DQ-BW-NEXT: kmovd %r10d, %k1 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm8, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm9 +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm8 = 
xmm9[0],zero,xmm9[1],zero,xmm9[2],zero,xmm9[3],zero,xmm9[4],zero,xmm9[5],zero,xmm9[6],zero,xmm9[7],zero +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm13 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-NEXT: vmovdqa %ymm3, %ymm12 +; AVX512DQ-BW-NEXT: vpermt2w %ymm8, %ymm13, %ymm12 +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm8 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm9[2,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm14 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm8, %zmm3 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm3, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm19 = xmm12[2,1,2,3] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm19[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm20 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm20, %zmm14 +; AVX512DQ-BW-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 +; AVX512DQ-BW-NEXT: kmovq %r10, %k2 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm14, %zmm3 {%k2} +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm15, %xmm14 +; AVX512DQ-BW-NEXT: vpshufb %xmm5, %xmm18, %xmm5 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm5[8],xmm14[8],xmm5[9],xmm14[9],xmm5[10],xmm14[10],xmm5[11],xmm14[11],xmm5[12],xmm14[12],xmm5[13],xmm14[13],xmm5[14],xmm14[14],xmm5[15],xmm14[15] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm5, %zmm7, %zmm14 +; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm16, %xmm5 +; AVX512DQ-BW-NEXT: vpshufb %xmm0, %xmm17, %xmm7 +; 
AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3],xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512DQ-BW-NEXT: vprold $16, %xmm7, %xmm7 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = zmm5[0,0,0,1,4,4,4,5] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm5[0],zero,xmm5[1],zero,xmm5[2],zero,xmm5[3],zero,xmm5[4],zero,xmm5[5],zero,xmm5[6],zero,xmm5[7],zero +; AVX512DQ-BW-NEXT: vpermi2w %ymm7, %ymm14, %ymm13 +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm7 = xmm5[2,1,2,3] +; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm7[0],zero,xmm7[1],zero,xmm7[2],zero,xmm7[3],zero,xmm7[4],zero,xmm7[5],zero,xmm7[6],zero,xmm7[7],zero +; AVX512DQ-BW-NEXT: vpermt2w %zmm7, %zmm8, %zmm14 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm13, %zmm7 +; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm8[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm14 = xmm8[2,1,2,3] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm14[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm20, %zmm13 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm13, %zmm7 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = 
ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %ymm16 +; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %ymm17 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm17[0],ymm16[0],ymm17[1],ymm16[1],ymm17[2],ymm16[2],ymm17[3],ymm16[3],ymm17[4],ymm16[4],ymm17[5],ymm16[5],ymm17[6],ymm16[6],ymm17[7],ymm16[7],ymm17[16],ymm16[16],ymm17[17],ymm16[17],ymm17[18],ymm16[18],ymm17[19],ymm16[19],ymm17[20],ymm16[20],ymm17[21],ymm16[21],ymm17[22],ymm16[22],ymm17[23],ymm16[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm18, %zmm15 +; AVX512DQ-BW-NEXT: movl $613566756, %r10d # imm = 0x24924924 ; AVX512DQ-BW-NEXT: kmovd %r10d, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm5, %zmm0 {%k2} -; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm6[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm5 = zmm5[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm5 = 
zmm5[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: movabsq $-9076969306111049208, %r10 # imm = 0x8208208208208208 -; AVX512DQ-BW-NEXT: kmovq %r10, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm5, %zmm0 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %ymm9 -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm9, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %ymm10 -; AVX512DQ-BW-NEXT: vpshufb %ymm7, %ymm10, %ymm6 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm6 = ymm10[8],ymm9[8],ymm10[9],ymm9[9],ymm10[10],ymm9[10],ymm10[11],ymm9[11],ymm10[12],ymm9[12],ymm10[13],ymm9[13],ymm10[14],ymm9[14],ymm10[15],ymm9[15],ymm10[24],ymm9[24],ymm10[25],ymm9[25],ymm10[26],ymm9[26],ymm10[27],ymm9[27],ymm10[28],ymm9[28],ymm10[29],ymm9[29],ymm10[30],ymm9[30],ymm10[31],ymm9[31] -; AVX512DQ-BW-NEXT: vpermw %ymm6, %ymm8, %ymm6 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %ymm17 -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm17, %ymm5 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %ymm19 -; AVX512DQ-BW-NEXT: vpshufb %ymm12, %ymm19, %ymm7 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm7[0],ymm5[0],ymm7[1],ymm5[1],ymm7[2],ymm5[2],ymm7[3],ymm5[3],ymm7[4],ymm5[4],ymm7[5],ymm5[5],ymm7[6],ymm5[6],ymm7[7],ymm5[7],ymm7[16],ymm5[16],ymm7[17],ymm5[17],ymm7[18],ymm5[18],ymm7[19],ymm5[19],ymm7[20],ymm5[20],ymm7[21],ymm5[21],ymm7[22],ymm5[22],ymm7[23],ymm5[23] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm7 = 
ymm19[8],ymm17[8],ymm19[9],ymm17[9],ymm19[10],ymm17[10],ymm19[11],ymm17[11],ymm19[12],ymm17[12],ymm19[13],ymm17[13],ymm19[14],ymm17[14],ymm19[15],ymm17[15],ymm19[24],ymm17[24],ymm19[25],ymm17[25],ymm19[26],ymm17[26],ymm19[27],ymm17[27],ymm19[28],ymm17[28],ymm19[29],ymm17[29],ymm19[30],ymm17[30],ymm19[31],ymm17[31] -; AVX512DQ-BW-NEXT: vpermw %ymm7, %ymm11, %ymm7 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm5, %zmm5 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb %zmm14, %zmm13, %zmm6 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm6, %zmm5 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm6 = zmm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm6 = zmm6[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 (%rsi), %xmm21 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %xmm7 -; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} xmm20 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm7, %xmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdi), %xmm22 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rdi), %xmm8 -; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm8, %xmm14 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm6[8],xmm14[9],xmm6[9],xmm14[10],xmm6[10],xmm14[11],xmm6[11],xmm14[12],xmm6[12],xmm14[13],xmm6[13],xmm14[14],xmm6[14],xmm14[15],xmm6[15] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-NEXT: vpermt2w 
%zmm14, %zmm25, %zmm6 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rcx), %xmm23 -; AVX512DQ-BW-NEXT: vmovdqa 32(%rcx), %xmm15 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm15, %xmm14 -; AVX512DQ-BW-NEXT: vmovdqa64 (%rdx), %xmm24 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm18, %xmm16 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm16[0],xmm14[0],xmm16[1],xmm14[1],xmm16[2],xmm14[2],xmm16[3],xmm14[3],xmm16[4],xmm14[4],xmm16[5],xmm14[5],xmm16[6],xmm14[6],xmm16[7],xmm14[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm18[0],xmm15[0],xmm18[1],xmm15[1],xmm18[2],xmm15[2],xmm18[3],xmm15[3],xmm18[4],xmm15[4],xmm18[5],xmm15[5],xmm18[6],xmm15[6],xmm18[7],xmm15[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm16, %xmm16 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm14, %zmm16, %zmm14 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm14 = zmm14[0,0,0,1,4,4,4,5] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm14, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %xmm14 -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm16 = xmm14[0],zero,xmm14[1],zero,xmm14[2],zero,xmm14[3],zero,xmm14[4],zero,xmm14[5],zero,xmm14[6],zero,xmm14[7],zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm26 = xmm14[2,1,2,3] -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm26 = xmm26[0],zero,xmm26[1],zero,xmm26[2],zero,xmm26[3],zero,xmm26[4],zero,xmm26[5],zero,xmm26[6],zero,xmm26[7],zero -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm27 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm27, %zmm16 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm16, %zmm6 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm16 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm28 = xmm16[2,1,2,3] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm28 = xmm28[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm28, %zmm27, %zmm26 -; AVX512DQ-BW-NEXT: movabsq $585610922974906400, %rcx # imm = 
0x820820820820820 -; AVX512DQ-BW-NEXT: kmovq %rcx, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm26, %zmm6 {%k3} -; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm21, %xmm26 -; AVX512DQ-BW-NEXT: vpshufb %xmm20, %xmm22, %xmm20 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm26 = xmm20[8],xmm26[8],xmm20[9],xmm26[9],xmm20[10],xmm26[10],xmm20[11],xmm26[11],xmm20[12],xmm26[12],xmm20[13],xmm26[13],xmm20[14],xmm26[14],xmm20[15],xmm26[15] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm25, %zmm20 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm23, %xmm25 -; AVX512DQ-BW-NEXT: vpshufb %xmm12, %xmm24, %xmm12 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm25[0],xmm12[1],xmm25[1],xmm12[2],xmm25[2],xmm12[3],xmm25[3],xmm12[4],xmm25[4],xmm12[5],xmm25[5],xmm12[6],xmm25[6],xmm12[7],xmm25[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512DQ-BW-NEXT: vprold $16, %xmm25, %xmm25 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm25, %zmm12 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm12 = zmm12[0,0,0,1,4,4,4,5] -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm20 {%k2} -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm12 = xmm13[0],zero,xmm13[1],zero,xmm13[2],zero,xmm13[3],zero,xmm13[4],zero,xmm13[5],zero,xmm13[6],zero,xmm13[7],zero -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm25 = xmm13[2,1,2,3] -; AVX512DQ-BW-NEXT: vpmovzxbw {{.*#+}} xmm25 = xmm25[0],zero,xmm25[1],zero,xmm25[2],zero,xmm25[3],zero,xmm25[4],zero,xmm25[5],zero,xmm25[6],zero,xmm25[7],zero -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm27, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm12, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm11[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm25 = 
xmm11[2,1,2,3] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm25[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-NEXT: vpermt2w %zmm25, %zmm27, %zmm12 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm12, %zmm20 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm12 = ymm19[0],ymm17[0],ymm19[1],ymm17[1],ymm19[2],ymm17[2],ymm19[3],ymm17[3],ymm19[4],ymm17[4],ymm19[5],ymm17[5],ymm19[6],ymm17[6],ymm19[7],ymm17[7],ymm19[16],ymm17[16],ymm19[17],ymm17[17],ymm19[18],ymm17[18],ymm19[19],ymm17[19],ymm19[20],ymm17[20],ymm19[21],ymm17[21],ymm19[22],ymm17[22],ymm19[23],ymm17[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm19 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512DQ-BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm17 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm20, %zmm15 {%k2} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm15, %ymm19 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQ-BW-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm5, %ymm22 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512DQ-BW-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512DQ-BW-NEXT: kmovd %r10d, %k3 +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512DQ-BW-NEXT: vpermt2w %ymm22, %ymm23, %ymm15 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm19, %zmm15, %zmm15 +; AVX512DQ-BW-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] +; AVX512DQ-BW-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-NEXT: vpshufb %ymm19, 
%ymm8, %ymm22 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512DQ-BW-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512DQ-BW-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512DQ-BW-NEXT: kmovq %r10, %k4 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm24, %zmm15 {%k4} +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm21, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqa 32(%rsi), %ymm11 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %ymm21 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm11[0],ymm21[1],ymm11[1],ymm21[2],ymm11[2],ymm21[3],ymm11[3],ymm21[4],ymm11[4],ymm21[5],ymm11[5],ymm21[6],ymm11[6],ymm21[7],ymm11[7],ymm21[16],ymm11[16],ymm21[17],ymm11[17],ymm21[18],ymm11[18],ymm21[19],ymm11[19],ymm21[20],ymm11[20],ymm21[21],ymm11[21],ymm21[22],ymm11[22],ymm21[23],ymm11[23] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm18, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm10, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vextracti64x4 $1, %zmm4, %ymm6 +; AVX512DQ-BW-NEXT: vmovdqa 
32(%r8), %ymm10 +; AVX512DQ-BW-NEXT: vpshufb %ymm20, %ymm10, %ymm10 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpermt2w %ymm9, %ymm23, %ymm4 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm4, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm6 +; AVX512DQ-BW-NEXT: vpshufb %ymm19, %ymm6, %ymm6 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm6, %zmm25, %zmm9 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm9, %zmm4 {%k4} +; AVX512DQ-BW-NEXT: vpbroadcastq {{.*#+}} ymm6 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm11, %ymm9 +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm21, %ymm10 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm9 = ymm10[0],ymm9[0],ymm10[1],ymm9[1],ymm10[2],ymm9[2],ymm10[3],ymm9[3],ymm10[4],ymm9[4],ymm10[5],ymm9[5],ymm10[6],ymm9[6],ymm10[7],ymm9[7],ymm10[16],ymm9[16],ymm10[17],ymm9[17],ymm10[18],ymm9[18],ymm10[19],ymm9[19],ymm10[20],ymm9[20],ymm10[21],ymm9[21],ymm10[22],ymm9[22],ymm10[23],ymm9[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-NEXT: vpermt2w %zmm9, %zmm12, %zmm10 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm17, %zmm10 {%k1} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQ-BW-NEXT: # ymm9 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm13, %ymm17 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm13 = xmm13[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; 
AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm21 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512DQ-BW-NEXT: vpermt2w %zmm17, %zmm21, %zmm13 -; AVX512DQ-BW-NEXT: movl $1227133513, %ecx # imm = 0x49249249 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm13, %zmm10 {%k2} -; AVX512DQ-BW-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] -; AVX512DQ-BW-NEXT: # ymm13 = mem[0,1,0,1] -; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm11, %ymm17 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm17, %zmm21, %zmm11 -; AVX512DQ-BW-NEXT: movabsq $2342443691899625602, %rcx # imm = 0x2082082082082082 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm10 = ymm21[8],ymm11[8],ymm21[9],ymm11[9],ymm21[10],ymm11[10],ymm21[11],ymm11[11],ymm21[12],ymm11[12],ymm21[13],ymm11[13],ymm21[14],ymm11[14],ymm21[15],ymm11[15],ymm21[24],ymm11[24],ymm21[25],ymm11[25],ymm21[26],ymm11[26],ymm21[27],ymm11[27],ymm21[28],ymm11[28],ymm21[29],ymm11[29],ymm21[30],ymm11[30],ymm21[31],ymm11[31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-NEXT: vpermw %ymm10, %ymm11, %ymm10 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm10, %zmm9, %zmm9 +; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm22, %ymm10 +; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm24, %ymm12 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm12 = 
ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; AVX512DQ-BW-NEXT: vpermw %ymm12, %ymm18, %ymm12 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm9, %zmm10 {%k2} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm9 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512DQ-BW-NEXT: vpshufb %zmm9, %zmm2, %zmm2 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} +; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 ; AVX512DQ-BW-NEXT: kmovq %rcx, %k3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm3 = ymm4[0],ymm3[0],ymm4[1],ymm3[1],ymm4[2],ymm3[2],ymm4[3],ymm3[3],ymm4[4],ymm3[4],ymm4[5],ymm3[5],ymm4[6],ymm3[6],ymm4[7],ymm3[7],ymm4[16],ymm3[16],ymm4[17],ymm3[17],ymm4[18],ymm3[18],ymm4[19],ymm3[19],ymm4[20],ymm3[20],ymm4[21],ymm3[21],ymm4[22],ymm3[22],ymm4[23],ymm3[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm18[8],xmm15[8],xmm18[9],xmm15[9],xmm18[10],xmm15[10],xmm18[11],xmm15[11],xmm18[12],xmm15[12],xmm18[13],xmm15[13],xmm18[14],xmm15[14],xmm18[15],xmm15[15] -; AVX512DQ-BW-NEXT: vpermt2w 
%zmm3, %zmm19, %zmm4 +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm16, %ymm1 +; AVX512DQ-BW-NEXT: vpshufb %ymm6, %ymm17, %ymm2 ; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm8[8],xmm7[8],xmm8[9],xmm7[9],xmm8[10],xmm7[10],xmm8[11],xmm7[11],xmm8[12],xmm7[12],xmm8[13],xmm7[13],xmm8[14],xmm7[14],xmm8[15],xmm7[15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm2 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm4, %zmm2 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa 32(%r8), %ymm1 -; AVX512DQ-BW-NEXT: vpshufb %ymm9, %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm14[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu16 %zmm3, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-BW-NEXT: vpshufb %ymm13, %ymm1, %ymm1 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm16[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm21, %zmm3 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm3, %zmm2 {%k3} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 256(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm6, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 320(%rax) +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = 
ymm17[8],ymm16[8],ymm17[9],ymm16[9],ymm17[10],ymm16[10],ymm17[11],ymm16[11],ymm17[12],ymm16[12],ymm17[13],ymm16[13],ymm17[14],ymm16[14],ymm17[15],ymm16[15],ymm17[24],ymm16[24],ymm17[25],ymm16[25],ymm17[26],ymm16[26],ymm17[27],ymm16[27],ymm17[28],ymm16[28],ymm17[29],ymm16[29],ymm17[30],ymm16[30],ymm17[31],ymm16[31] +; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm11, %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512DQ-BW-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512DQ-BW-NEXT: vpermw %ymm2, %ymm18, %ymm2 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-NEXT: vpshufb %zmm9, %zmm5, %zmm1 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm10, 320(%rax) +; 
AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm15, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; ; AVX512DQ-BW-FCP-LABEL: store_i8_stride6_vf64: ; AVX512DQ-BW-FCP: # %bb.0: -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm10 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm10, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm1[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm11[8],ymm10[8],ymm11[9],ymm10[9],ymm11[10],ymm10[10],ymm11[11],ymm10[11],ymm11[12],ymm10[12],ymm11[13],ymm10[13],ymm11[14],ymm10[14],ymm11[15],ymm10[15],ymm11[24],ymm10[24],ymm11[25],ymm10[25],ymm11[26],ymm10[26],ymm11[27],ymm10[27],ymm11[28],ymm10[28],ymm11[29],ymm10[29],ymm11[30],ymm10[30],ymm11[31],ymm10[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm0 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm3, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %ymm4 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm2 = [5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm5 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm7 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = 
mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rcx), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm24 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm26 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm16, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm20, %xmm13 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm16[0],xmm20[1],xmm16[1],xmm20[2],xmm16[2],xmm20[3],xmm16[3],xmm20[4],xmm16[4],xmm20[5],xmm16[5],xmm20[6],xmm16[6],xmm20[7],xmm16[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm12, %zmm18, %zmm13 -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm23 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm15, %xmm12 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm19, %xmm14 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm19[0],xmm15[0],xmm19[1],xmm15[1],xmm19[2],xmm15[2],xmm19[3],xmm15[3],xmm19[4],xmm15[4],xmm19[5],xmm15[5],xmm19[6],xmm15[6],xmm19[7],xmm15[7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm14, %zmm25, %zmm12 
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm28 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm27 = [8,9,0,0,0,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm28, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm14 = xmm28[0],zero,xmm28[1],zero,xmm28[2],zero,xmm28[3],zero,xmm28[4],zero,xmm28[5],zero,xmm28[6],zero,xmm28[7],zero -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm30 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm17, %zmm30, %zmm14 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm21, %xmm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm2, %xmm22, %xmm31 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm31[0],xmm17[0],xmm31[1],xmm17[1],xmm31[2],xmm17[2],xmm31[3],xmm17[3],xmm31[4],xmm17[4],xmm31[5],xmm17[5],xmm31[6],xmm17[6],xmm31[7],xmm17[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm31 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm24, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm23, %xmm26, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm25, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm29, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm29[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, 
%zmm30, %zmm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm27, %xmm8, %xmm27 -; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm25 = xmm8[0],zero,xmm8[1],zero,xmm8[2],zero,xmm8[3],zero,xmm8[4],zero,xmm8[5],zero,xmm8[6],zero,xmm8[7],zero -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm27, %zmm30, %zmm25 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm31, %xmm9, %xmm31 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm9[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm30, %zmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %ymm30 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm30[0],ymm7[0],ymm30[1],ymm7[1],ymm30[2],ymm7[2],ymm30[3],ymm7[3],ymm30[4],ymm7[4],ymm30[5],ymm7[5],ymm30[6],ymm7[6],ymm30[7],ymm7[7],ymm30[16],ymm7[16],ymm30[17],ymm7[17],ymm30[18],ymm7[18],ymm30[19],ymm7[19],ymm30[20],ymm7[20],ymm30[21],ymm7[21],ymm30[22],ymm7[22],ymm30[23],ymm7[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm31 = ymm6[0],ymm5[0],ymm6[1],ymm5[1],ymm6[2],ymm5[2],ymm6[3],ymm5[3],ymm6[4],ymm5[4],ymm6[5],ymm5[5],ymm6[6],ymm5[6],ymm6[7],ymm5[7],ymm6[16],ymm5[16],ymm6[17],ymm5[17],ymm6[18],ymm5[18],ymm6[19],ymm5[19],ymm6[20],ymm5[20],ymm6[21],ymm5[21],ymm6[22],ymm5[22],ymm6[23],ymm5[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm24 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm24, %zmm21 
-; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm26 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm20[8],xmm16[8],xmm20[9],xmm16[9],xmm20[10],xmm16[10],xmm20[11],xmm16[11],xmm20[12],xmm16[12],xmm20[13],xmm16[13],xmm20[14],xmm16[14],xmm20[15],xmm16[15] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm20 = ymm26[0],ymm4[0],ymm26[1],ymm4[1],ymm26[2],ymm4[2],ymm26[3],ymm4[3],ymm26[4],ymm4[4],ymm26[5],ymm4[5],ymm26[6],ymm4[6],ymm26[7],ymm4[7],ymm26[16],ymm4[16],ymm26[17],ymm4[17],ymm26[18],ymm4[18],ymm26[19],ymm4[19],ymm26[20],ymm4[20],ymm26[21],ymm4[21],ymm26[22],ymm4[22],ymm26[23],ymm4[23] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm0, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] -; AVX512DQ-BW-FCP-NEXT: # ymm1 = mem[0,1,0,1] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm8, %ymm31 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm8[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm0 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm31, %zmm0, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm11 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm19[8],xmm15[8],xmm19[9],xmm15[9],xmm19[10],xmm15[10],xmm19[11],xmm15[11],xmm19[12],xmm15[12],xmm19[13],xmm15[13],xmm19[14],xmm15[14],xmm19[15],xmm15[15] +; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm0 = 
[5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10,5,8,7,6,9,0,0,10] +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rcx), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm6, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdx), %xmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdx), %xmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm8, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3],xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm5 = [1,0,3,2,1,0,3,2,1,0,3,2,5,4,7,6,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm5, %zmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rsi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm9, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %xmm18 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rdi), %xmm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm10, %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm11[8],xmm3[8],xmm11[9],xmm3[9],xmm11[10],xmm3[10],xmm11[11],xmm3[11],xmm11[12],xmm3[12],xmm11[13],xmm3[13],xmm11[14],xmm3[14],xmm11[15],xmm3[15] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,3,2,1,0,3,2,1,0,3,2,1,4,5,6,5,32,33,34,35,32,33,34,35,32,33,34,35,36,37,38,39] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm13, %zmm12 +; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %r10d # imm = 0x92492492 +; AVX512DQ-BW-FCP-NEXT: kmovd 
%r10d, %k1 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm4, %zmm12 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm11 +; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm3 = xmm11[0],zero,xmm11[1],zero,xmm11[2],zero,xmm11[3],zero,xmm11[4],zero,xmm11[5],zero,xmm11[6],zero,xmm11[7],zero +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm12, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm3, %ymm14, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm19 = [16,37,18,19,38,21,22,39,24,25,32,27,28,33,30,31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} xmm20 = [8,9,0,0,0,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm11, %xmm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm19, %zmm12 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm4, %zmm3 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} xmm21 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm12, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm12[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm23 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,38,37,32,39,38,37,32,39,38,37,32,39,33,33,33,33] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm23, %zmm22 +; AVX512DQ-BW-FCP-NEXT: movabsq $585610922974906400, %r10 # imm = 0x820820820820820 +; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm22, %zmm3 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm15, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm0, %xmm16, %xmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm22[0],xmm4[0],xmm22[1],xmm4[1],xmm22[2],xmm4[2],xmm22[3],xmm4[3],xmm22[4],xmm4[4],xmm22[5],xmm4[5],xmm22[6],xmm4[6],xmm22[7],xmm4[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm16[0],xmm15[0],xmm16[1],xmm15[1],xmm16[2],xmm15[2],xmm16[3],xmm15[3],xmm16[4],xmm15[4],xmm16[5],xmm15[5],xmm16[6],xmm15[6],xmm16[7],xmm15[7] +; AVX512DQ-BW-FCP-NEXT: 
vpermt2w %zmm4, %zmm5, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm17, %xmm4 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm7, %xmm18, %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm13, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm5 {%k1} +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpmovzxbw {{.*#+}} xmm7 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero +; AVX512DQ-BW-FCP-NEXT: vpermi2w %ymm7, %ymm5, %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm20, %xmm4, %xmm7 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm7, %zmm19, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm14, %zmm5 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm21, %xmm7, %xmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm14 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm23, %zmm14 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm14, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm13 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm14 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm14[0],ymm13[0],ymm14[1],ymm13[1],ymm14[2],ymm13[2],ymm14[3],ymm13[3],ymm14[4],ymm13[4],ymm14[5],ymm13[5],ymm14[6],ymm13[6],ymm14[7],ymm13[7],ymm14[16],ymm13[16],ymm14[17],ymm13[17],ymm14[18],ymm13[18],ymm14[19],ymm13[19],ymm14[20],ymm13[20],ymm14[21],ymm13[21],ymm14[22],ymm13[22],ymm14[23],ymm13[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = 
xmm16[8],xmm15[8],xmm16[9],xmm15[9],xmm16[10],xmm15[10],xmm16[11],xmm15[11],xmm16[12],xmm15[12],xmm16[13],xmm15[13],xmm16[14],xmm15[14],xmm16[15],xmm15[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm21 = [5,4,3,6,5,4,3,6,5,4,3,6,7,7,7,7,41,40,43,42,41,40,43,42,41,40,43,42,45,44,47,46] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm21, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm15 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%rdi), %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm19 = ymm16[0],ymm15[0],ymm16[1],ymm15[1],ymm16[2],ymm15[2],ymm16[3],ymm15[3],ymm16[4],ymm15[4],ymm16[5],ymm15[5],ymm16[6],ymm15[6],ymm16[7],ymm15[7],ymm16[16],ymm15[16],ymm16[17],ymm15[17],ymm16[18],ymm15[18],ymm16[19],ymm15[19],ymm16[20],ymm15[20],ymm16[21],ymm15[21],ymm16[22],ymm15[22],ymm16[23],ymm15[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm18 = [4,3,6,5,4,3,6,5,4,3,6,5,7,7,7,7,40,43,42,41,40,43,42,41,40,43,42,41,44,45,46,45] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm19, %zmm18, %zmm17 +; AVX512DQ-BW-FCP-NEXT: movl $613566756, %r10d # imm = 0x24924924 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k2 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm17 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm17, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm20 = [2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0] +; AVX512DQ-BW-FCP-NEXT: # ymm20 = mem[0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm4, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm22 = ymm22[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: movw $18724, %r10w # imm = 0x4924 +; AVX512DQ-BW-FCP-NEXT: kmovd %r10d, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm22, %ymm19 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; 
AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm23 = [18,1,2,19,4,5,20,7,8,21,10,11,22,13,14,23] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm22, %ymm23, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm17, %zmm17 ; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512DQ-BW-FCP-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm24, %zmm10 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm11, %zmm0, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %ymm11 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm1, %ymm11, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm28[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm1, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm29[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm1, %zmm0, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm4, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm26, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[16],ymm0[16],ymm1[17],ymm0[17],ymm1[18],ymm0[18],ymm1[19],ymm0[19],ymm1[20],ymm0[20],ymm1[21],ymm0[21],ymm1[22],ymm0[22],ymm1[23],ymm0[23] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm1 = 
ymm26[8],ymm4[8],ymm26[9],ymm4[9],ymm26[10],ymm4[10],ymm26[11],ymm4[11],ymm26[12],ymm4[12],ymm26[13],ymm4[13],ymm26[14],ymm4[14],ymm26[15],ymm4[15],ymm26[24],ymm4[24],ymm26[25],ymm4[25],ymm26[26],ymm4[26],ymm26[27],ymm4[27],ymm26[28],ymm4[28],ymm26[29],ymm4[29],ymm26[30],ymm4[30],ymm26[31],ymm4[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm24 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm1, %ymm24, %ymm1 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r8), %zmm0 -; AVX512DQ-BW-FCP-NEXT: movl $613566756, %eax # imm = 0x24924924 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm3, %zmm4 {%k1} -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm1 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm1, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: movl $-1840700270, %eax # imm = 0x92492492 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 (%r9), %zmm0 -; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[4,5,6,7,4,5,6,7] -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm0[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: movabsq $-9076969306111049208, %rax # imm = 0x8208208208208208 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm4 {%k3} -; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm3 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm3, %ymm5, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb 
%ymm3, %ymm6, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[4],ymm0[4],ymm3[5],ymm0[5],ymm3[6],ymm0[6],ymm3[7],ymm0[7],ymm3[16],ymm0[16],ymm3[17],ymm0[17],ymm3[18],ymm0[18],ymm3[19],ymm0[19],ymm3[20],ymm0[20],ymm3[21],ymm0[21],ymm3[22],ymm0[22],ymm3[23],ymm0[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm6[8],ymm5[8],ymm6[9],ymm5[9],ymm6[10],ymm5[10],ymm6[11],ymm5[11],ymm6[12],ymm5[12],ymm6[13],ymm5[13],ymm6[14],ymm5[14],ymm6[15],ymm5[15],ymm6[24],ymm5[24],ymm6[25],ymm5[25],ymm6[26],ymm5[26],ymm6[27],ymm5[27],ymm6[28],ymm5[28],ymm6[29],ymm5[29],ymm6[30],ymm5[30],ymm6[31],ymm5[31] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm5 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm3, %ymm5, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm7, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm7[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm25 = [2,5,4,3,2,5,4,3,2,5,4,3,6,5,6,7,40,41,42,43,40,41,42,43,40,41,42,43,44,45,46,47] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm22, %zmm25, %zmm24 +; AVX512DQ-BW-FCP-NEXT: movabsq $2342443691899625602, %r10 # imm = 0x2082082082082082 +; AVX512DQ-BW-FCP-NEXT: kmovq %r10, %k4 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm24, %zmm17 {%k4} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %ymm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm24[0],ymm22[0],ymm24[1],ymm22[1],ymm24[2],ymm22[2],ymm24[3],ymm22[3],ymm24[4],ymm22[4],ymm24[5],ymm22[5],ymm24[6],ymm22[6],ymm24[7],ymm22[7],ymm24[16],ymm22[16],ymm24[17],ymm22[17],ymm24[18],ymm22[18],ymm24[19],ymm22[19],ymm24[20],ymm22[20],ymm24[21],ymm22[21],ymm24[22],ymm22[22],ymm24[23],ymm22[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm21, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%rsi), %ymm8 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm26 = ymm21[0],ymm8[0],ymm21[1],ymm8[1],ymm21[2],ymm8[2],ymm21[3],ymm8[3],ymm21[4],ymm8[4],ymm21[5],ymm8[5],ymm21[6],ymm8[6],ymm21[7],ymm8[7],ymm21[16],ymm8[16],ymm21[17],ymm8[17],ymm21[18],ymm8[18],ymm21[19],ymm8[19],ymm21[20],ymm8[20],ymm21[21],ymm8[21],ymm21[22],ymm8[22],ymm21[23],ymm8[23] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm26, %zmm18, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm6, %zmm9 {%k2} +; AVX512DQ-BW-FCP-NEXT: vextracti64x4 $1, %zmm9, %ymm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm20, %ymm10, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %ymm10, %ymm6 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm11[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %ymm10, %ymm23, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm9, %zmm6 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r9), %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm19, %ymm9, %ymm9 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm12[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm9, %zmm25, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm10, %zmm6 {%k4} +; AVX512DQ-BW-FCP-NEXT: vpbroadcastq {{.*#+}} ymm9 = [8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0,8,7,6,9,0,0,10,0] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm8, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm21, %ymm11 
+; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm11[0],ymm10[0],ymm11[1],ymm10[1],ymm11[2],ymm10[2],ymm11[3],ymm10[3],ymm11[4],ymm10[4],ymm11[5],ymm10[5],ymm11[6],ymm10[6],ymm11[7],ymm10[7],ymm11[16],ymm10[16],ymm11[17],ymm10[17],ymm11[18],ymm10[18],ymm11[19],ymm10[19],ymm11[20],ymm10[20],ymm11[21],ymm10[21],ymm11[22],ymm10[22],ymm11[23],ymm10[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm8 = ymm21[8],ymm8[8],ymm21[9],ymm8[9],ymm21[10],ymm8[10],ymm21[11],ymm8[11],ymm21[12],ymm8[12],ymm21[13],ymm8[13],ymm21[14],ymm8[14],ymm21[15],ymm8[15],ymm21[24],ymm8[24],ymm21[25],ymm8[25],ymm21[26],ymm8[26],ymm21[27],ymm8[27],ymm21[28],ymm8[28],ymm21[29],ymm8[29],ymm21[30],ymm8[30],ymm21[31],ymm8[31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm11 = [12,11,14,13,12,11,14,13,12,11,14,13,15,15,15,15] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm8, %ymm11, %ymm8 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm10, %zmm8 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm22, %ymm10 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm24, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm10 = ymm12[0],ymm10[0],ymm12[1],ymm10[1],ymm12[2],ymm10[2],ymm12[3],ymm10[3],ymm12[4],ymm10[4],ymm12[5],ymm10[5],ymm12[6],ymm10[6],ymm12[7],ymm10[7],ymm12[16],ymm10[16],ymm12[17],ymm10[17],ymm12[18],ymm10[18],ymm12[19],ymm10[19],ymm12[20],ymm10[20],ymm12[21],ymm10[21],ymm12[22],ymm10[22],ymm12[23],ymm10[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm12 = ymm24[8],ymm22[8],ymm24[9],ymm22[9],ymm24[10],ymm22[10],ymm24[11],ymm22[11],ymm24[12],ymm22[12],ymm24[13],ymm22[13],ymm24[14],ymm22[14],ymm24[15],ymm22[15],ymm24[24],ymm22[24],ymm24[25],ymm22[25],ymm24[26],ymm22[26],ymm24[27],ymm22[27],ymm24[28],ymm22[28],ymm24[29],ymm22[29],ymm24[30],ymm22[30],ymm24[31],ymm22[31] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} ymm18 = [13,12,11,14,13,12,11,14,13,12,11,14,15,15,15,15] +; 
AVX512DQ-BW-FCP-NEXT: vpermw %ymm12, %ymm18, %ymm12 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm12, %zmm10, %zmm10 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm8, %zmm10 {%k2} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [6,5,8,7,0,9,0,0,6,5,8,7,0,9,0,0,10,13,12,11,14,0,0,15,10,13,12,11,14,0,0,15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm2, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm2 = zmm2[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm2, %zmm10 {%k1} +; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[u,6,u,5,u,8,u,7,u,u,u,9,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,42,u,45,u,44,u,43,u,46,u,u,u,u,u,47,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: movabsq $-9076969306111049208, %rcx # imm = 0x8208208208208208 +; AVX512DQ-BW-FCP-NEXT: kmovq %rcx, %k3 +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm15, %ymm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm9, %ymm16, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[16],ymm1[16],ymm2[17],ymm1[17],ymm2[18],ymm1[18],ymm2[19],ymm1[19],ymm2[20],ymm1[20],ymm2[21],ymm1[21],ymm2[22],ymm1[22],ymm2[23],ymm1[23] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm16[8],ymm15[8],ymm16[9],ymm15[9],ymm16[10],ymm15[10],ymm16[11],ymm15[11],ymm16[12],ymm15[12],ymm16[13],ymm15[13],ymm16[14],ymm15[14],ymm16[15],ymm15[15],ymm16[24],ymm15[24],ymm16[25],ymm15[25],ymm16[26],ymm15[26],ymm16[27],ymm15[27],ymm16[28],ymm15[28],ymm16[29],ymm15[29],ymm16[30],ymm15[30],ymm16[31],ymm15[31] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm11, %ymm2 +; 
AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm13, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm0, %ymm14, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[16],ymm2[16],ymm0[17],ymm2[17],ymm0[18],ymm2[18],ymm0[19],ymm2[19],ymm0[20],ymm2[20],ymm0[21],ymm2[21],ymm0[22],ymm2[22],ymm0[23],ymm2[23] ; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm7, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm2, %ymm30, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[16],ymm3[16],ymm2[17],ymm3[17],ymm2[18],ymm3[18],ymm2[19],ymm3[19],ymm2[20],ymm3[20],ymm2[21],ymm3[21],ymm2[22],ymm3[22],ymm2[23],ymm3[23] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm30[8],ymm7[8],ymm30[9],ymm7[9],ymm30[10],ymm7[10],ymm30[11],ymm7[11],ymm30[12],ymm7[12],ymm30[13],ymm7[13],ymm30[14],ymm7[14],ymm30[15],ymm7[15],ymm30[24],ymm7[24],ymm30[25],ymm7[25],ymm30[26],ymm7[26],ymm30[27],ymm7[27],ymm30[28],ymm7[28],ymm30[29],ymm7[29],ymm30[30],ymm7[30],ymm30[31],ymm7[31] -; AVX512DQ-BW-FCP-NEXT: vpermw %ymm3, %ymm24, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm1, %zmm8, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm0, %zmm2 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm0 = zmm9[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] 
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,2,2,3,6,6,6,7] -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm0, %zmm2 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm22, %zmm21 {%k1} -; AVX512DQ-BW-FCP-NEXT: movl $1227133513, %eax # imm = 0x49249249 -; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k3 -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm20, %zmm21 {%k3} -; AVX512DQ-BW-FCP-NEXT: movabsq $2342443691899625602, %rax # imm = 0x2082082082082082 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k4 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm15, %zmm21 {%k4} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm13, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm14, %zmm12 {%k1} -; AVX512DQ-BW-FCP-NEXT: movabsq $585610922974906400, %rax # imm = 0x820820820820820 -; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k5 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm23, %zmm12 {%k5} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm17, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm25, %zmm18 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm27, %zmm18 {%k5} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm16, %zmm10 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm11, %zmm10 {%k3} -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm19, %zmm10 {%k4} -; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm12, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm14[8],ymm13[8],ymm14[9],ymm13[9],ymm14[10],ymm13[10],ymm14[11],ymm13[11],ymm14[12],ymm13[12],ymm14[13],ymm13[13],ymm14[14],ymm13[14],ymm14[15],ymm13[15],ymm14[24],ymm13[24],ymm14[25],ymm13[25],ymm14[26],ymm13[26],ymm14[27],ymm13[27],ymm14[28],ymm13[28],ymm14[29],ymm13[29],ymm14[30],ymm13[30],ymm14[31],ymm13[31] +; AVX512DQ-BW-FCP-NEXT: vpermw %ymm2, %ymm18, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm0, 
%zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k2} +; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} zmm1 = zmm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,u,21,u,24,u,23,u,u,u,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,58,u,61,u,60,u,59,u,62,u,u,u,u,u,63] +; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,2,2,3,6,6,6,7] +; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm1, %zmm0 {%k3} +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm10, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm6, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm3, 192(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index f4055a953badd..25e489eef9d11 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -925,16 +925,14 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] -; AVX2-NEXT: vpblendvb 
%ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpor %ymm3, %ymm5, %ymm3 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28] ; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero ; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-NEXT: vmovq %xmm0, 48(%rax) @@ -967,16 +965,14 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] -; 
AVX2-FP-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 +; AVX2-FP-NEXT: vpor %ymm3, %ymm5, %ymm3 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28],zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,ymm4[2,10,18,26],zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28] ; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,ymm0[0],zero,zero,zero,zero,zero,zero,ymm0[1],zero,zero,zero,zero,ymm0[18,26],zero,zero,zero,zero,zero,ymm0[19,27],zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm1[0,8],zero,zero,zero,zero,zero,ymm1[1,9],zero,zero,zero,zero,zero,zero,zero,ymm1[18],zero,zero,zero,zero,zero,zero,ymm1[19],zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; AVX2-FP-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX2-FP-NEXT: vmovdqa %ymm0, (%rax) ; AVX2-FP-NEXT: vextracti128 $1, %ymm3, %xmm0 ; AVX2-FP-NEXT: vmovq %xmm0, 48(%rax) @@ -1205,24 +1201,21 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = 
zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] -; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 -; AVX512BW-NEXT: kmovq %rcx, %k1 -; AVX512BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512BW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 -; 
AVX512BW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512BW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm0[0,2,1,3,4,6,5,7] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zmm2[18],zero,zero,zero,zero,zero,zero,zmm2[19],zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46,54],zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] +; AVX512BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm2 +; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) +; AVX512BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512BW-NEXT: vmovq %xmm1, 48(%rax) +; AVX512BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -1283,24 +1276,21 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm0[0,2,1,3,4,6,5,7] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = 
zero,zero,zero,zero,zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zero,zero,zmm1[18],zero,zero,zero,zero,zero,zero,zmm1[19],zero,zero,zero,zero,zmm1[36,44],zero,zero,zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46,54],zero,zero,zero,zero,zero,zero,zmm1[55],zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm1 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[0,8],zero,zero,zero,zero,zero,zmm1[1,9],zero,zero,zero,zero,zero,zmm1[2,10,18,26],zero,zero,zero,zero,zero,zmm1[19,27],zero,zero,zero,zero,zero,zmm1[20,28],zero,zero,zero,zmm1[37,45],zero,zero,zero,zero,zero,zmm1[38,46],zero,zero,zero,zero,zero,zero,zero,zmm1[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[2,3,0,1,2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm2[19,27],zero,zero,zero,zero,zero,zmm2[20,28],zero,zero,zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46],zero,zero,zero,zmm2[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512DQ-BW-NEXT: movabsq $63546854584629360, %rcx # imm = 0xE1C3870E1C3870 -; AVX512DQ-BW-NEXT: kmovq %rcx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %zmm0, %zmm1 {%k1} -; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) -; AVX512DQ-BW-NEXT: 
vextracti32x4 $3, %zmm1, %xmm0 -; AVX512DQ-BW-NEXT: vmovq %xmm0, 48(%rax) -; AVX512DQ-BW-NEXT: vmovdqa %ymm1, (%rax) +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm0[0,2,1,3,4,6,5,7] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[0,8],zero,zero,zero,zero,zero,zmm2[1,9],zero,zero,zero,zero,zero,zero,zero,zmm2[18],zero,zero,zero,zero,zero,zero,zmm2[19],zero,zero,zero,zero,zmm2[36,44],zero,zero,zero,zero,zero,zmm2[37,45],zero,zero,zero,zero,zero,zmm2[38,46,54],zero,zero,zero,zero,zero,zero,zmm2[55],zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,0,2,5,7,4,6] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} zmm0 = zero,zero,zero,zero,zero,zero,zmm0[0],zero,zero,zero,zero,zero,zero,zmm0[1],zero,zero,zero,zero,zmm0[18,26],zero,zero,zero,zero,zero,zmm0[19,27],zero,zero,zero,zero,zero,zero,zero,zmm0[36],zero,zero,zero,zero,zero,zero,zmm0[37],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zmm0[55,63],zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | zmm1 | zmm2 +; AVX512DQ-BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) +; AVX512DQ-BW-NEXT: vextracti32x4 $3, %zmm0, %xmm1 +; AVX512DQ-BW-NEXT: vmovq %xmm1, 48(%rax) +; AVX512DQ-BW-NEXT: vmovdqa %ymm0, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -1824,8 +1814,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm4[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero,zero,zero ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5 +; AVX2-NEXT: vpor %ymm5, %ymm7, 
%ymm5 ; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] ; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[4],zero,zero,zero,zero,zero,zero,ymm8[5],zero,zero,zero,zero,zero,zero,ymm8[6],zero,zero,zero,zero,zero,ymm8[23],zero,zero,zero,zero,zero,zero,ymm8[24],zero,zero,zero @@ -1903,8 +1892,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm3[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm7, %ymm5, %ymm5 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] -; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm4, %ymm5, %ymm5 +; AVX2-FP-NEXT: vpor %ymm4, %ymm5, %ymm5 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[4],zero,zero,zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero @@ -2323,19 +2311,17 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512BW-NEXT: vinserti128 $1, 
(%rsi), %ymm1, %ymm1 +; AVX512BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] +; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[2,10],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero +; AVX512BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] ; AVX512BW-NEXT: vpor %ymm7, %ymm6, %ymm6 -; AVX512BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} -; AVX512BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] -; AVX512BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero ; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2] ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] -; AVX512BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512BW-NEXT: vporq %zmm5, %zmm6, %zmm5 ; AVX512BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; 
AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] @@ -2445,12 +2431,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero ; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] -; AVX512BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 ; AVX512BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512BW-FCP-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX512BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq @@ -2470,19 +2453,17 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm6, %ymm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, (%rsi), %ymm1, %ymm1 +; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,2,0,2] +; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,zero,zero,zero,ymm6[1,9],zero,zero,zero,zero,zero,ymm6[2,10],zero,zero,zero,zero,zero,ymm6[19,27],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,ymm1[5],zero,zero,zero,zero,zero,zero,ymm1[6],zero,zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,ymm1[24],zero,zero,zero,zero,zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm1[2,3,0,1] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[5],zero,zero,zero,zero,zero,zero,ymm7[6],zero,zero,zero,zero,zero,ymm7[23],zero,zero,zero,zero,zero,zero,ymm7[24],zero,zero,zero,zero,zero,zero,ymm7[25] ; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm6, %ymm6 -; AVX512DQ-BW-NEXT: movl $202911840, %ecx # imm = 0xC183060 -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vmovdqu8 %ymm5, %ymm6 {%k1} -; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm5 = ymm1[0,2,0,2] -; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,zero,zero,ymm5[1,9],zero,zero,zero,zero,zero,ymm5[2,10],zero,zero,zero,zero,zero,ymm5[19,27],zero,zero,zero,zero,zero,ymm5[20,28],zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm0[0,2,0,2] ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,zero,zero,ymm7[1,9],zero,zero,zero,zero,zero,ymm7[18,26],zero,zero,zero,zero,zero,ymm7[19,27],zero,zero,zero,zero,zero,ymm7[20,28] -; AVX512DQ-BW-NEXT: vpor %ymm5, %ymm7, %ymm5 -; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5 +; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 +; AVX512DQ-BW-NEXT: vporq %zmm5, %zmm6, %zmm5 ; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6 ; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[4],zero,zero,zero,zero,zero,zero,ymm6[5],zero,zero,zero,zero,zero,zero,ymm6[6],zero,zero,zero,zero,zero,zero,zero,ymm6[23],zero,zero,zero,zero,zero,zero,ymm6[24],zero,zero ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1] @@ -2592,12 +2573,9 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = 
xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10],zero,zero,zero,zero,zero,xmm1[13,12],zero,zero,zero,zero,zero,xmm1[15,14],zero ; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm4[13],zero,zero,zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,zero,zero,xmm4[15] -; AVX512DQ-BW-FCP-NEXT: vpor %xmm1, %xmm2, %xmm1 -; AVX512DQ-BW-FCP-NEXT: movw $-7741, %cx # imm = 0xE1C3 -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm2 = xmm2 | xmm0 | xmm1 ; AVX512DQ-BW-FCP-NEXT: vmovdqa %ymm5, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, 96(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm2, 96(%rax) ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm7, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq @@ -3598,24 +3576,24 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2: # %bb.0: ; AVX2-NEXT: pushq %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: vmovdqa (%rdi), %ymm4 -; AVX2-NEXT: vmovdqa (%rsi), %ymm6 -; AVX2-NEXT: vmovdqa (%rdx), %ymm3 -; AVX2-NEXT: vmovdqa (%rcx), %ymm5 -; AVX2-NEXT: vmovdqa (%r8), %ymm7 +; AVX2-NEXT: vmovdqa (%rdi), %ymm7 ; AVX2-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vmovdqa (%r9), %ymm2 -; AVX2-NEXT: vmovdqa (%rax), %ymm1 +; AVX2-NEXT: vmovdqa (%rsi), %ymm3 +; AVX2-NEXT: vmovdqa (%rdx), %ymm1 ; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm3[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] +; AVX2-NEXT: vmovdqa (%rcx), %ymm2 +; AVX2-NEXT: vmovdqa (%r8), %ymm5 +; AVX2-NEXT: vmovdqa (%r9), %ymm6 +; AVX2-NEXT: vmovdqa (%rax), %ymm4 +; AVX2-NEXT: vpshufb {{.*#+}} 
ymm0 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm8 = ymm1[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm8 = ymm8[0,2,3,3,4,6,7,7] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0] ; AVX2-NEXT: # ymm9 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] -; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm4[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,u,28,u,u,u,u,31,u,29,u,u,u] +; AVX2-NEXT: vpshufhw {{.*#+}} ymm9 = ymm7[0,1,2,3,6,7,7,6,8,9,10,11,14,15,15,14] ; AVX2-NEXT: vpshufd {{.*#+}} ymm9 = ymm9[2,2,3,3,6,6,7,7] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0] ; AVX2-NEXT: # ymm10 = mem[0,1,0,1] @@ -3623,13 +3601,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29] -; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[27],zero,ymm5[27,28,29,30],zero,ymm5[28],zero,ymm5[26,27,30,31],zero,ymm5[29] +; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = 
ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm6[27],zero,zero,zero,zero,ymm6[30],zero,ymm6[28],zero,zero,zero,zero,ymm6[31],zero ; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8 ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255,0] ; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0 @@ -3698,68 +3676,67 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] ; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10 -; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] -; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] +; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,21,u,19,u,u,u,u,22,u,20,u,u] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload +; AVX2-NEXT: vpshuflw {{.*#+}} ymm1 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,0,1,1,4,4,5,5] ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0,0,0,0,255,0,255,0,0,0,0,255,0,255,0,0,0] ; AVX2-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; 
AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero,ymm5[20] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22],zero,ymm2[20] +; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero,ymm13[20],zero ; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] ; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm13[18],zero,zero,zero,zero,ymm13[21],zero,ymm13[19],zero,zero,zero,zero,ymm13[22],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22] +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm5[18],zero,zero,zero,zero,ymm5[21],zero,ymm5[19],zero,zero,zero,zero,ymm5[22],zero ; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte 
Reload -; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm12[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] +; AVX2-NEXT: vpshuflw {{.*#+}} ymm7 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[23],zero,ymm4[27,20,21,26],zero,ymm4[24],zero,ymm4[26,27,26,27],zero,ymm4[25] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27],zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm12[23],zero,ymm12[27,20,21,26],zero,ymm12[24],zero,ymm12[26,27,26,27],zero,ymm12[25] +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27],zero ; AVX2-NEXT: vpor %ymm1, %ymm7, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27] +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = 
ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero,zero,ymm13[27] ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm13[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm13[25],zero,ymm13[23],zero,zero,zero,zero,ymm13[26],zero,ymm13[24],zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm5[25],zero,ymm5[23],zero,zero,zero,zero,ymm5[26],zero,ymm5[24],zero,zero,zero ; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] -; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] +; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] ; AVX2-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm7 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 -; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = 
zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,zero,zero,ymm6[18] -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[14],zero,zero,zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,zero,zero,ymm4[18],zero -; AVX2-NEXT: vpor %ymm6, %ymm4, %ymm4 -; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero -; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero -; AVX2-NEXT: vpor %ymm5, %ymm3, %ymm3 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31] -; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero -; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm2 -; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 -; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, 
%ymm2 +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,ymm5[1,2,3,0,1,14],zero,ymm5[0,1,0,1,14,15],zero,ymm5[15,16,17,18,19,16],zero,ymm5[30,31,16,17,16,17],zero,ymm5[31,30,31] +; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[13],zero,zero,zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,zero,zero,ymm6[17],zero,zero,zero +; AVX2-NEXT: vpor %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] +; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm4 +; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm12[14],zero,zero,zero,zero,zero,zero,ymm12[15],zero,zero,zero,zero,zero,zero,ymm12[16],zero,zero,zero,zero,zero,zero,ymm12[17],zero,zero,zero,zero,zero,zero,ymm12[18],zero +; AVX2-NEXT: vpor %ymm3, %ymm5, %ymm3 +; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero +; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,ymm13[14],zero,zero,zero,zero,zero,zero,ymm13[15],zero,zero,zero,zero,zero,zero,ymm13[16],zero,zero,zero,zero,zero,zero,ymm13[17],zero,zero,zero,zero,zero,zero +; AVX2-NEXT: vpor %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-NEXT: vpblendvb %ymm3, %ymm2, %ymm4, %ymm2 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: vmovdqa %ymm2, 96(%rax) ; 
AVX2-NEXT: vmovdqa %ymm1, 160(%rax) @@ -3905,22 +3882,21 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 ; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0] ; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero +; AVX2-FP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] +; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero ; AVX2-FP-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = 
zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero ; AVX2-FP-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero -; AVX2-FP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-FP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FP-NEXT: vpblendvb %ymm1, %ymm0, %ymm6, %ymm0 ; AVX2-FP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FP-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FP-NEXT: vmovdqa %ymm12, 128(%rax) @@ -4067,22 +4043,21 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13 ; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm14 = 
[0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FCP-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero +; AVX2-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6 +; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] +; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero,ymm3[18] ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,ymm1[14],zero,zero,zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,zero,zero,ymm1[18],zero ; AVX2-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1 ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero ; AVX2-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31] -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero -; AVX2-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] -; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] -; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpor %ymm1, %ymm0, %ymm0 +; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255] +; AVX2-FCP-NEXT: vpblendvb %ymm1, %ymm0, %ymm6, %ymm0 ; AVX2-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FCP-NEXT: vmovdqa %ymm0, 96(%rax) ; AVX2-FCP-NEXT: vmovdqa %ymm10, 128(%rax) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index a9da7abaa945c..3acc94d6e1fc4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -2071,9 +2071,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6] ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 -; AVX512BW-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512BW-NEXT: kmovd %ecx, %k1 -; AVX512BW-NEXT: vpord %zmm6, %zmm9, %zmm4 {%k1} +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | zmm4 | zmm6 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[1,3,1,3,5,7,5,7] ; AVX512BW-NEXT: vpshufb %zmm5, %zmm0, %zmm0 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7] @@ -2083,9 +2081,9 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512BW-NEXT: vpermq {{.*#+}} zmm2 = zmm3[1,3,1,3,5,7,5,7] ; AVX512BW-NEXT: vpshufb %zmm10, %zmm2, %zmm2 -; AVX512BW-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -2117,23 +2115,21 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4 ; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm4, %zmm4 -; AVX512BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512BW-FCP-NEXT: vpord %zmm7, %zmm4, %zmm5 {%k1} -; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] -; AVX512BW-FCP-NEXT: # zmm4 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm0 +; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | zmm5 | zmm7 +; AVX512BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3] +; AVX512BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0 ; AVX512BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm0 -; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512BW-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm1 +; AVX512BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm1 ; AVX512BW-FCP-NEXT: vpshufb %zmm9, %zmm1, %zmm1 -; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm2 +; AVX512BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm2 ; AVX512BW-FCP-NEXT: vpshufb %zmm10, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -2167,9 +2163,7 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm9 = zmm3[0,2,0,2,4,6,4,6] ; AVX512DQ-BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 -; AVX512DQ-BW-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512DQ-BW-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-NEXT: vpord %zmm6, %zmm9, %zmm4 {%k1} +; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} zmm9 = zmm9 | zmm4 | zmm6 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} 
zmm0 = zmm0[1,3,1,3,5,7,5,7] ; AVX512DQ-BW-NEXT: vpshufb %zmm5, %zmm0, %zmm0 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[1,3,1,3,5,7,5,7] @@ -2179,9 +2173,9 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512DQ-BW-NEXT: vpermq {{.*#+}} zmm2 = zmm3[1,3,1,3,5,7,5,7] ; AVX512DQ-BW-NEXT: vpshufb %zmm10, %zmm2, %zmm2 -; AVX512DQ-BW-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm4, (%rax) +; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, (%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -2213,23 +2207,21 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm4 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm10 = [128,128,128,128,128,128,0,8,128,128,128,128,128,128,1,9,128,128,128,128,128,128,18,26,128,128,128,128,128,128,19,27,128,128,128,128,128,128,36,44,128,128,128,128,128,128,37,45,128,128,128,128,128,128,54,62,128,128,128,128,128,128,55,63] ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm4, %zmm4 -; AVX512DQ-BW-FCP-NEXT: movw $-21846, %cx # imm = 0xAAAA -; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1 -; AVX512DQ-BW-FCP-NEXT: vpord %zmm7, %zmm4, %zmm5 {%k1} -; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm4 = [1,3,1,3,1,3,1,3] -; AVX512DQ-BW-FCP-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm4, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} zmm4 = zmm4 | zmm5 | zmm7 +; AVX512DQ-BW-FCP-NEXT: vbroadcasti32x4 {{.*#+}} zmm5 = [1,3,1,3,1,3,1,3] +; AVX512DQ-BW-FCP-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm0, %zmm5, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm6, %zmm0, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm4, %zmm1 +; 
AVX512DQ-BW-FCP-NEXT: vpermq %zmm1, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm8, %zmm1, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vporq %zmm0, %zmm1, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm4, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm2, %zmm5, %zmm1 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm9, %zmm1, %zmm1 -; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm4, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpermq %zmm3, %zmm5, %zmm2 ; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm10, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vpord %zmm1, %zmm2, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} zmm2 = zmm2 | zmm0 | zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm4, (%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <16 x i8>, ptr %in.vecptr0, align 64 @@ -8050,128 +8042,107 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512BW-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX512BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm17 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512BW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm19 -; AVX512BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm22 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512BW-NEXT: movl $572662306, %r11d # imm = 0x22222222 -; AVX512BW-NEXT: kmovd %r11d, %k1 -; AVX512BW-NEXT: vpermw %zmm4, %zmm6, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa (%r10), %xmm4 -; AVX512BW-NEXT: vmovdqa64 48(%r10), %xmm23 -; AVX512BW-NEXT: vmovdqa (%rax), %xmm7 -; AVX512BW-NEXT: vmovdqa64 48(%rax), %xmm24 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX512BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512BW-NEXT: vmovdqa64 48(%r9), %xmm25 -; AVX512BW-NEXT: vmovdqa (%r8), %xmm10 -; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm26 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] -; AVX512BW-NEXT: vpermw %zmm11, %zmm12, %zmm11 -; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] -; AVX512BW-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 -; AVX512BW-NEXT: kmovd %r11d, %k2 -; AVX512BW-NEXT: vpermw %zmm9, %zmm13, %zmm11 {%k2} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3],xmm17[4],xmm14[4],xmm17[5],xmm14[5],xmm17[6],xmm14[6],xmm17[7],xmm14[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = 
xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm15, %zmm6, %zmm9 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm27 = xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-NEXT: vpermw %zmm15, %zmm12, %zmm15 -; AVX512BW-NEXT: vpermw %zmm27, %zmm13, %zmm15 {%k2} -; AVX512BW-NEXT: vmovdqa64 32(%r10), %xmm27 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512BW-NEXT: vmovdqa64 32(%rax), %xmm28 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15] -; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm17, %zmm6, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm30 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512BW-NEXT: 
vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512BW-NEXT: vpermw %zmm19, %zmm12, %zmm19 -; AVX512BW-NEXT: vpermw %zmm17, %zmm13, %zmm19 {%k2} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm22, %zmm6, %zmm17 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512BW-NEXT: vpermw %zmm22, %zmm12, %zmm22 -; AVX512BW-NEXT: vpermw %zmm23, %zmm13, %zmm22 {%k2} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] -; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm23 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] -; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm21 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm16 = 
xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm18, %zmm6, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 16(%rsi), %xmm24 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] -; AVX512BW-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512BW-NEXT: vpermw %zmm18, %zmm12, %zmm18 -; AVX512BW-NEXT: vpermw %zmm20, %zmm13, %zmm18 {%k2} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm21[0],xmm23[0],xmm21[1],xmm23[1],xmm21[2],xmm23[2],xmm21[3],xmm23[3],xmm21[4],xmm23[4],xmm21[5],xmm23[5],xmm21[6],xmm23[6],xmm21[7],xmm23[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7] -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm26, %zmm6, %zmm20 {%k1} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm21[8],xmm23[8],xmm21[9],xmm23[9],xmm21[10],xmm23[10],xmm21[11],xmm23[11],xmm21[12],xmm23[12],xmm21[13],xmm23[13],xmm21[14],xmm23[14],xmm21[15],xmm23[15] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] -; AVX512BW-NEXT: vmovdqa64 16(%r10), %xmm24 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} 
zmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero,xmm23[2],zero,zero,zero,xmm23[3],zero,zero,zero,xmm23[4],zero,zero,zero,xmm23[5],zero,zero,zero,xmm23[6],zero,zero,zero,xmm23[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm21, %zmm6, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa64 16(%rax), %xmm21 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512BW-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512BW-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX512BW-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX512BW-NEXT: vpermw %zmm1, %zmm6, %zmm3 {%k1} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm21[0],xmm24[0],xmm21[1],xmm24[1],xmm21[2],xmm24[2],xmm21[3],xmm24[3],xmm21[4],xmm24[4],xmm21[5],xmm24[5],xmm21[6],xmm24[6],xmm21[7],xmm24[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512BW-NEXT: vpermw %zmm6, %zmm12, %zmm6 -; AVX512BW-NEXT: vpermw %zmm1, %zmm13, %zmm6 {%k2} -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm21[8],xmm24[8],xmm21[9],xmm24[9],xmm21[10],xmm24[10],xmm21[11],xmm24[11],xmm21[12],xmm24[12],xmm21[13],xmm24[13],xmm21[14],xmm24[14],xmm21[15],xmm24[15] -; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX512BW-NEXT: vpermw %zmm2, %zmm12, %zmm2 -; AVX512BW-NEXT: vpermw %zmm1, %zmm13, %zmm2 {%k2} -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512BW-NEXT: vpermw %zmm4, %zmm12, %zmm4 -; AVX512BW-NEXT: vpermw %zmm1, %zmm13, %zmm4 {%k2} +; AVX512BW-NEXT: vmovdqa (%r10), %xmm1 +; AVX512BW-NEXT: vmovdqa64 32(%r10), %xmm16 +; AVX512BW-NEXT: vmovdqa 48(%r10), %xmm14 +; AVX512BW-NEXT: vmovdqa (%rax), %xmm3 +; AVX512BW-NEXT: vmovdqa64 32(%rax), %xmm17 +; AVX512BW-NEXT: vmovdqa 48(%rax), %xmm15 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512BW-NEXT: vmovdqa64 32(%r9), %xmm19 +; AVX512BW-NEXT: vmovdqa64 48(%r9), %xmm18 +; AVX512BW-NEXT: vmovdqa (%r8), %xmm6 +; AVX512BW-NEXT: vmovdqa64 32(%r8), %xmm21 +; AVX512BW-NEXT: vmovdqa64 48(%r8), %xmm20 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512BW-NEXT: vpermt2w %zmm2, %zmm7, %zmm0 +; AVX512BW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512BW-NEXT: vmovdqa64 48(%rcx), %xmm22 +; AVX512BW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512BW-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; AVX512BW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512BW-NEXT: vmovdqa64 48(%rsi), %xmm24 +; AVX512BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512BW-NEXT: vmovdqa64 48(%rdi), %xmm25 +; AVX512BW-NEXT: 
vpunpckhbw {{.*#+}} xmm2 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm2 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7] +; AVX512BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm13 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7] +; AVX512BW-NEXT: vpermt2w %zmm26, %zmm12, %zmm11 +; AVX512BW-NEXT: vmovdqa64 32(%rcx), %xmm26 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512BW-NEXT: vmovdqa64 32(%rdx), %xmm27 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15] +; AVX512BW-NEXT: vmovdqa64 32(%rsi), %xmm28 +; AVX512BW-NEXT: vpermt2w %zmm14, %zmm7, %zmm15 +; AVX512BW-NEXT: vmovdqa64 32(%rdi), %xmm29 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = 
xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] +; AVX512BW-NEXT: vpermt2w %zmm18, %zmm12, %zmm14 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] +; AVX512BW-NEXT: vpermt2w %zmm18, %zmm7, %zmm20 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512BW-NEXT: vpermt2w %zmm22, %zmm12, %zmm18 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-NEXT: vmovdqa64 16(%r10), %xmm22 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512BW-NEXT: vmovdqa64 16(%rax), %xmm19 +; AVX512BW-NEXT: vpermt2w %zmm17, %zmm7, %zmm16 +; AVX512BW-NEXT: vmovdqa64 16(%r9), %xmm21 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = 
xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512BW-NEXT: vmovdqa64 16(%r8), %xmm24 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm12, %zmm17 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] +; AVX512BW-NEXT: vpermt2w %zmm23, %zmm7, %zmm25 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15] +; AVX512BW-NEXT: vmovdqa64 16(%rcx), %xmm22 +; AVX512BW-NEXT: vpermt2w %zmm19, %zmm7, %zmm21 +; AVX512BW-NEXT: vmovdqa64 16(%rdx), %xmm19 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512BW-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm4 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm7 +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15] +; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm3 +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm5 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 -; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm20 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm23 {%k1} -; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm16, %zmm17 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm7 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm21, 
%zmm3 {%k1} +; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm14, 448(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512BW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512BW-NEXT: vzeroupper ; AVX512BW-NEXT: retq ; @@ -8179,172 +8150,173 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FCP: # %bb.0: ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512BW-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm17 +; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm16 ; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm20 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm17 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = 
xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm5 = [2312,2826,3340,3854] +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm6 = [1284,1798] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm4 ; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm22 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm21 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm18 ; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512BW-FCP-NEXT: vpermw %zmm6, %zmm8, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm6 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm25 -; AVX512BW-FCP-NEXT: vmovdqa (%rax), %xmm9 +; AVX512BW-FCP-NEXT: vmovdqa64 
32(%rdx), %xmm22 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,32,2,3,4,33,6,7,8,42,10,11,12,43,14,15,16,36,18,19,20,37,22,23,24,46,26,27,28,47,30,31] +; AVX512BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm0 +; AVX512BW-FCP-NEXT: vmovdqa (%r10), %xmm4 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm24 +; AVX512BW-FCP-NEXT: vmovdqa (%rax), %xmm8 ; AVX512BW-FCP-NEXT: vmovdqa64 48(%rax), %xmm26 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm13 ; AVX512BW-FCP-NEXT: vmovdqa (%r9), %xmm10 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm27 -; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm11 -; AVX512BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] -; AVX512BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12 -; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] -; AVX512BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 -; AVX512BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2} -; AVX512BW-FCP-NEXT: vpunpcklbw 
{{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm16 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm16, %ymm16 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm29 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm29, %ymm15, %ymm15 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] -; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm8, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] -; AVX512BW-FCP-NEXT: vpermw %zmm16, %zmm13, %zmm16 -; AVX512BW-FCP-NEXT: vpermw %zmm29, %zmm14, %zmm16 {%k2} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm17, %xmm18 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm29, %ymm18 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm29 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm28 +; AVX512BW-FCP-NEXT: vmovdqa (%r8), %xmm12 +; AVX512BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm30 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = 
xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,1,0,32,4,5,1,33,2,1,2,42,4,5,3,43,0,1,4,36,4,5,5,37,0,1,6,46,6,5,7,47] +; AVX512BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm11 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm15 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm25 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm13, %ymm13 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm23[0],xmm18[0],xmm23[1],xmm18[1],xmm23[2],xmm18[2],xmm23[3],xmm18[3],xmm23[4],xmm18[4],xmm23[5],xmm18[5],xmm23[6],xmm18[6],xmm23[7],xmm18[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX512BW-FCP-NEXT: vpermt2w %zmm15, %zmm9, %zmm13 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm25 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm14, %zmm15 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm16, %xmm17 +; 
AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm25, %ymm17 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm25 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm16 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm29 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm27 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm17, %ymm17 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm30 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] -; AVX512BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm31 -; AVX512BW-FCP-NEXT: vpermw %zmm18, %zmm8, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] -; AVX512BW-FCP-NEXT: vpermw %zmm18, %zmm13, %zmm18 -; AVX512BW-FCP-NEXT: vpermw %zmm19, %zmm14, %zmm18 {%k2} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] -; 
AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm24 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm24, %ymm24 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm19, %xmm25 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm19, %ymm19 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm19, %zmm19 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] -; AVX512BW-FCP-NEXT: vpermw %zmm24, %zmm8, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7] -; AVX512BW-FCP-NEXT: vpermw %zmm24, %zmm13, %zmm24 -; AVX512BW-FCP-NEXT: vpermw %zmm25, %zmm14, %zmm24 {%k2} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm20, %xmm21 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm25, %ymm21 +; AVX512BW-FCP-NEXT: vpermt2w %zmm17, %zmm9, %zmm16 +; AVX512BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm31 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm18 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = 
xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm18, %zmm14, %zmm17 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm23 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm23, %ymm23 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm24 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm18, %ymm18 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm18, %zmm18 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm23 +; AVX512BW-FCP-NEXT: vpermt2w %zmm23, %zmm9, %zmm18 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm29[0],xmm25[0],xmm29[1],xmm25[1],xmm29[2],xmm25[2],xmm29[3],xmm25[3],xmm29[4],xmm25[4],xmm29[5],xmm25[5],xmm29[6],xmm25[6],xmm29[7],xmm25[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm24 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm14, %zmm23 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm19, %xmm20 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, 
%xmm20, %ymm24, %ymm20 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm19 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm19, %ymm19 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm20, %zmm19 +; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm24 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] +; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm22 ; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm20, %ymm20 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm20 -; AVX512BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm25 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm23 -; AVX512BW-FCP-NEXT: vpermw %zmm21, %zmm8, %zmm20 {%k1} +; AVX512BW-FCP-NEXT: vpermt2w %zmm20, %zmm9, %zmm19 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rcx), %xmm26 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm21 -; AVX512BW-FCP-NEXT: vpermw %zmm22, %zmm14, %zmm21 {%k2} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm22 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; 
AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm27, %ymm22 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm25[8],xmm29[9],xmm25[9],xmm29[10],xmm25[10],xmm29[11],xmm25[11],xmm29[12],xmm25[12],xmm29[13],xmm25[13],xmm29[14],xmm25[14],xmm29[15],xmm25[15] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm21 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15] +; AVX512BW-FCP-NEXT: vpermt2w %zmm21, %zmm14, %zmm20 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm22[0],xmm24[0],xmm22[1],xmm24[1],xmm22[2],xmm24[2],xmm22[3],xmm24[3],xmm22[4],xmm24[4],xmm22[5],xmm24[5],xmm22[6],xmm24[6],xmm22[7],xmm24[7] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm21, %xmm25 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm27, %ymm25 ; AVX512BW-FCP-NEXT: vmovdqa64 16(%rdx), %xmm27 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm22, %zmm22 -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] -; AVX512BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15] -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm23 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; 
AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0 -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] -; AVX512BW-FCP-NEXT: vpermw %zmm23, %zmm8, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX512BW-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm23 -; AVX512BW-FCP-NEXT: vpshufb %ymm4, %ymm23, %ymm4 -; AVX512BW-FCP-NEXT: vmovdqa64 16(%r10), %xmm23 -; AVX512BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX512BW-FCP-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm4 +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm21, %ymm21 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm25, %zmm21 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm25, %ymm25 +; AVX512BW-FCP-NEXT: vpermt2w %zmm25, %zmm9, %zmm21 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm24[8],xmm22[9],xmm24[9],xmm22[10],xmm24[10],xmm22[11],xmm24[11],xmm22[12],xmm24[12],xmm22[13],xmm24[13],xmm22[14],xmm24[14],xmm22[15],xmm24[15] +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm22, %xmm24 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm25, %ymm24 +; 
AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm22, %ymm22 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm22 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 +; AVX512BW-FCP-NEXT: vpermt2w %zmm24, %zmm9, %zmm22 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX512BW-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512BW-FCP-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX512BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 +; AVX512BW-FCP-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512BW-FCP-NEXT: vmovdqa 16(%r9), %xmm2 ; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX512BW-FCP-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm8, %zmm2 {%k1} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512BW-FCP-NEXT: vpermw %zmm8, %zmm13, %zmm8 -; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm8 {%k2} -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = 
xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15] -; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX512BW-FCP-NEXT: vpermw %zmm4, %zmm13, %zmm4 -; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm4 {%k2} -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512BW-FCP-NEXT: vpermw %zmm5, %zmm13, %zmm5 -; AVX512BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm5 {%k2} +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm9, %zmm1 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm9 +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 +; 
AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm4 ; AVX512BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm22 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} -; AVX512BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm16 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm21 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm22 {%k1} +; AVX512BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} ; AVX512BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax) +; 
AVX512BW-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512BW-FCP-NEXT: vzeroupper ; AVX512BW-FCP-NEXT: retq ; @@ -8352,128 +8324,107 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW: # %bb.0: ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm1 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm16 -; AVX512DQ-BW-NEXT: vmovdqa 48(%rcx), %xmm14 -; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm2 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm18 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm17 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] -; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm3 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm19 -; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm5 -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm22 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm6 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-BW-NEXT: movl $572662306, %r11d # imm = 0x22222222 -; AVX512DQ-BW-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-NEXT: vpermw %zmm4, %zmm6, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa (%r10), 
%xmm4 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%r10), %xmm23 -; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm7 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%rax), %xmm24 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm8 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%r9), %xmm25 -; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm10 -; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm26 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15] -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] -; AVX512DQ-BW-NEXT: vpermw %zmm11, %zmm12, %zmm11 -; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] -; AVX512DQ-BW-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 -; AVX512DQ-BW-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-NEXT: vpermw %zmm9, %zmm13, %zmm11 {%k2} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm17[0],xmm14[0],xmm17[1],xmm14[1],xmm17[2],xmm14[2],xmm17[3],xmm14[3],xmm17[4],xmm14[4],xmm17[5],xmm14[5],xmm17[6],xmm14[6],xmm17[7],xmm14[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm22[0],xmm19[0],xmm22[1],xmm19[1],xmm22[2],xmm19[2],xmm22[3],xmm19[3],xmm22[4],xmm19[4],xmm22[5],xmm19[5],xmm22[6],xmm19[6],xmm22[7],xmm19[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm9 = xmm9[0],zero,zero,zero,xmm9[1],zero,zero,zero,xmm9[2],zero,zero,zero,xmm9[3],zero,zero,zero,xmm9[4],zero,zero,zero,xmm9[5],zero,zero,zero,xmm9[6],zero,zero,zero,xmm9[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm15, %zmm6, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm27 = 
xmm24[0],xmm23[0],xmm24[1],xmm23[1],xmm24[2],xmm23[2],xmm24[3],xmm23[3],xmm24[4],xmm23[4],xmm24[5],xmm23[5],xmm24[6],xmm23[6],xmm24[7],xmm23[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512DQ-BW-NEXT: vpermw %zmm15, %zmm12, %zmm15 -; AVX512DQ-BW-NEXT: vpermw %zmm27, %zmm13, %zmm15 {%k2} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r10), %xmm27 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm14[8],xmm17[9],xmm14[9],xmm17[10],xmm14[10],xmm17[11],xmm14[11],xmm17[12],xmm14[12],xmm17[13],xmm14[13],xmm17[14],xmm14[14],xmm17[15],xmm14[15] -; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %xmm28 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm22[8],xmm19[8],xmm22[9],xmm19[9],xmm22[10],xmm19[10],xmm22[11],xmm19[11],xmm22[12],xmm19[12],xmm22[13],xmm19[13],xmm22[14],xmm19[14],xmm22[15],xmm19[15] -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm29 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm14 = xmm14[0],zero,zero,zero,xmm14[1],zero,zero,zero,xmm14[2],zero,zero,zero,xmm14[3],zero,zero,zero,xmm14[4],zero,zero,zero,xmm14[5],zero,zero,zero,xmm14[6],zero,zero,zero,xmm14[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm17, %zmm6, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm30 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm24[8],xmm23[8],xmm24[9],xmm23[9],xmm24[10],xmm23[10],xmm24[11],xmm23[11],xmm24[12],xmm23[12],xmm24[13],xmm23[13],xmm24[14],xmm23[14],xmm24[15],xmm23[15] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; AVX512DQ-BW-NEXT: vpermw %zmm19, %zmm12, %zmm19 -; AVX512DQ-BW-NEXT: vpermw %zmm17, %zmm13, %zmm19 {%k2} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = 
xmm18[0],xmm16[0],xmm18[1],xmm16[1],xmm18[2],xmm16[2],xmm18[3],xmm16[3],xmm18[4],xmm16[4],xmm18[5],xmm16[5],xmm18[6],xmm16[6],xmm18[7],xmm16[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm17 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero,xmm17[2],zero,zero,zero,xmm17[3],zero,zero,zero,xmm17[4],zero,zero,zero,xmm17[5],zero,zero,zero,xmm17[6],zero,zero,zero,xmm17[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm22, %zmm6, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512DQ-BW-NEXT: vpermw %zmm22, %zmm12, %zmm22 -; AVX512DQ-BW-NEXT: vpermw %zmm23, %zmm13, %zmm22 {%k2} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm23 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm21 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm16 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero,xmm16[2],zero,zero,zero,xmm16[3],zero,zero,zero,xmm16[4],zero,zero,zero,xmm16[5],zero,zero,zero,xmm16[6],zero,zero,zero,xmm16[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm18, %zmm6, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rsi), %xmm24 -; AVX512DQ-BW-NEXT: 
vpunpckhbw {{.*#+}} xmm20 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] -; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdi), %xmm25 -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512DQ-BW-NEXT: vpermw %zmm18, %zmm12, %zmm18 -; AVX512DQ-BW-NEXT: vpermw %zmm20, %zmm13, %zmm18 {%k2} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm21[0],xmm23[0],xmm21[1],xmm23[1],xmm21[2],xmm23[2],xmm21[3],xmm23[3],xmm21[4],xmm23[4],xmm21[5],xmm23[5],xmm21[6],xmm23[6],xmm21[7],xmm23[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7] -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm20 = xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero,xmm20[2],zero,zero,zero,xmm20[3],zero,zero,zero,xmm20[4],zero,zero,zero,xmm20[5],zero,zero,zero,xmm20[6],zero,zero,zero,xmm20[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm26, %zmm6, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm21[8],xmm23[8],xmm21[9],xmm23[9],xmm21[10],xmm23[10],xmm21[11],xmm23[11],xmm21[12],xmm23[12],xmm21[13],xmm23[13],xmm21[14],xmm23[14],xmm21[15],xmm23[15] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] -; AVX512DQ-BW-NEXT: vmovdqa64 16(%r10), %xmm24 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm23 = xmm23[0],zero,zero,zero,xmm23[1],zero,zero,zero,xmm23[2],zero,zero,zero,xmm23[3],zero,zero,zero,xmm23[4],zero,zero,zero,xmm23[5],zero,zero,zero,xmm23[6],zero,zero,zero,xmm23[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm21, %zmm6, %zmm23 {%k1} -; 
AVX512DQ-BW-NEXT: vmovdqa64 16(%rax), %xmm21 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512DQ-BW-NEXT: vmovdqa 16(%r9), %xmm2 -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3],xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7] -; AVX512DQ-BW-NEXT: vmovdqa 16(%r8), %xmm5 -; AVX512DQ-BW-NEXT: vpmovzxwq {{.*#+}} zmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero,xmm3[2],zero,zero,zero,xmm3[3],zero,zero,zero,xmm3[4],zero,zero,zero,xmm3[5],zero,zero,zero,xmm3[6],zero,zero,zero,xmm3[7],zero,zero,zero -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm6, %zmm3 {%k1} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm21[0],xmm24[0],xmm21[1],xmm24[1],xmm21[2],xmm24[2],xmm21[3],xmm24[3],xmm21[4],xmm24[4],xmm21[5],xmm24[5],xmm21[6],xmm24[6],xmm21[7],xmm24[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; AVX512DQ-BW-NEXT: vpermw %zmm6, %zmm12, %zmm6 -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm13, %zmm6 {%k2} -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm21[8],xmm24[8],xmm21[9],xmm24[9],xmm21[10],xmm24[10],xmm21[11],xmm24[11],xmm21[12],xmm24[12],xmm21[13],xmm24[13],xmm21[14],xmm24[14],xmm21[15],xmm24[15] -; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX512DQ-BW-NEXT: vpermw %zmm2, %zmm12, %zmm2 -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm13, %zmm2 {%k2} -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = 
xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3],xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7] -; AVX512DQ-BW-NEXT: vpermw %zmm4, %zmm12, %zmm4 -; AVX512DQ-BW-NEXT: vpermw %zmm1, %zmm13, %zmm4 {%k2} +; AVX512DQ-BW-NEXT: vmovdqa (%r10), %xmm1 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r10), %xmm16 +; AVX512DQ-BW-NEXT: vmovdqa 48(%r10), %xmm14 +; AVX512DQ-BW-NEXT: vmovdqa (%rax), %xmm3 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rax), %xmm17 +; AVX512DQ-BW-NEXT: vmovdqa 48(%rax), %xmm15 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; AVX512DQ-BW-NEXT: vmovdqa (%r9), %xmm4 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r9), %xmm19 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%r9), %xmm18 +; AVX512DQ-BW-NEXT: vmovdqa (%r8), %xmm6 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%r8), %xmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%r8), %xmm20 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm7 = [0,1,0,32,4,5,1,33,2,1,2,34,4,5,3,35,0,1,4,36,4,5,5,37,0,1,6,38,6,5,7,39] +; AVX512DQ-BW-NEXT: vpermt2w %zmm2, %zmm7, %zmm0 +; AVX512DQ-BW-NEXT: vmovdqa (%rcx), %xmm5 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rcx), %xmm22 +; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm8 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; AVX512DQ-BW-NEXT: vmovdqa (%rsi), %xmm9 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rsi), %xmm24 +; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm10 +; AVX512DQ-BW-NEXT: vmovdqa64 48(%rdi), %xmm25 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = 
xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; AVX512DQ-BW-NEXT: vpmovsxbw {{.*#+}} zmm12 = [0,32,0,0,1,33,0,0,2,34,0,0,3,35,0,0,4,36,0,0,5,37,0,0,6,38,0,0,7,39,0,0] +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm2 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm20[0],xmm18[0],xmm20[1],xmm18[1],xmm20[2],xmm18[2],xmm20[3],xmm18[3],xmm20[4],xmm18[4],xmm20[5],xmm18[5],xmm20[6],xmm18[6],xmm20[7],xmm18[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm11, %zmm7, %zmm13 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm25[0],xmm24[0],xmm25[1],xmm24[1],xmm25[2],xmm24[2],xmm25[3],xmm24[3],xmm25[4],xmm24[4],xmm25[5],xmm24[5],xmm25[6],xmm24[6],xmm25[7],xmm24[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm26, %zmm12, %zmm11 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rcx), %xmm26 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm15[8],xmm14[8],xmm15[9],xmm14[9],xmm15[10],xmm14[10],xmm15[11],xmm14[11],xmm15[12],xmm14[12],xmm15[13],xmm14[13],xmm15[14],xmm14[14],xmm15[15],xmm14[15] +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdx), %xmm27 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm20[8],xmm18[8],xmm20[9],xmm18[9],xmm20[10],xmm18[10],xmm20[11],xmm18[11],xmm20[12],xmm18[12],xmm20[13],xmm18[13],xmm20[14],xmm18[14],xmm20[15],xmm18[15] +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rsi), %xmm28 +; AVX512DQ-BW-NEXT: vpermt2w %zmm14, %zmm7, %zmm15 +; AVX512DQ-BW-NEXT: vmovdqa64 32(%rdi), %xmm29 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm18 = 
xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm14 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm18, %zmm12, %zmm14 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm20 = xmm21[0],xmm19[0],xmm21[1],xmm19[1],xmm21[2],xmm19[2],xmm21[3],xmm19[3],xmm21[4],xmm19[4],xmm21[5],xmm19[5],xmm21[6],xmm19[6],xmm21[7],xmm19[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm18, %zmm7, %zmm20 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm22 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm29[0],xmm28[0],xmm29[1],xmm28[1],xmm29[2],xmm28[2],xmm29[3],xmm28[3],xmm29[4],xmm28[4],xmm29[5],xmm28[5],xmm29[6],xmm28[6],xmm29[7],xmm28[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm22, %zmm12, %zmm18 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r10), %xmm22 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm21[8],xmm19[8],xmm21[9],xmm19[9],xmm21[10],xmm19[10],xmm21[11],xmm19[11],xmm21[12],xmm19[12],xmm21[13],xmm19[13],xmm21[14],xmm19[14],xmm21[15],xmm19[15] +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rax), %xmm19 +; AVX512DQ-BW-NEXT: vpermt2w %zmm17, %zmm7, %zmm16 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r9), %xmm21 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm23 = 
xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512DQ-BW-NEXT: vmovdqa64 16(%r8), %xmm24 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm29[8],xmm28[8],xmm29[9],xmm28[9],xmm29[10],xmm28[10],xmm29[11],xmm28[11],xmm29[12],xmm28[12],xmm29[13],xmm28[13],xmm29[14],xmm28[14],xmm29[15],xmm28[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm12, %zmm17 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm24[0],xmm21[0],xmm24[1],xmm21[1],xmm24[2],xmm21[2],xmm24[3],xmm21[3],xmm24[4],xmm21[4],xmm24[5],xmm21[5],xmm24[6],xmm21[6],xmm24[7],xmm21[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm23, %zmm7, %zmm25 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm24[8],xmm21[8],xmm24[9],xmm21[9],xmm24[10],xmm21[10],xmm24[11],xmm21[11],xmm24[12],xmm21[12],xmm24[13],xmm21[13],xmm24[14],xmm21[14],xmm24[15],xmm21[15] +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rcx), %xmm22 +; AVX512DQ-BW-NEXT: vpermt2w %zmm19, %zmm7, %zmm21 +; AVX512DQ-BW-NEXT: vmovdqa64 16(%rdx), %xmm19 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] +; AVX512DQ-BW-NEXT: vmovdqa 16(%rsi), %xmm3 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; AVX512DQ-BW-NEXT: vmovdqa 16(%rdi), %xmm6 +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm7, %zmm4 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm19[0],xmm22[0],xmm19[1],xmm22[1],xmm19[2],xmm22[2],xmm19[3],xmm22[3],xmm19[4],xmm22[4],xmm19[5],xmm22[5],xmm19[6],xmm22[6],xmm19[7],xmm22[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3],xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm7 +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm19[8],xmm22[8],xmm19[9],xmm22[9],xmm19[10],xmm22[10],xmm19[11],xmm22[11],xmm19[12],xmm22[12],xmm19[13],xmm22[13],xmm19[14],xmm22[14],xmm19[15],xmm22[15] +; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm3[8],xmm6[9],xmm3[9],xmm6[10],xmm3[10],xmm6[11],xmm3[11],xmm6[12],xmm3[12],xmm6[13],xmm3[13],xmm6[14],xmm3[14],xmm6[15],xmm3[15] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm3 +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] +; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] +; AVX512DQ-BW-NEXT: vpermt2w %zmm1, %zmm12, %zmm5 ; AVX512DQ-BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm9 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm19, %zmm14 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm22, %zmm17 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm18, %zmm16 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm6, %zmm20 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm2, %zmm23 {%k1} -; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm15, %zmm14 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm16, %zmm17 {%k1} +; 
AVX512DQ-BW-NEXT: vmovdqa32 %zmm25, %zmm7 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm21, %zmm3 {%k1} +; AVX512DQ-BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} ; AVX512DQ-BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, (%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm23, 192(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm20, 128(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm16, 320(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 256(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm3, 192(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm7, 128(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm17, 320(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm18, 256(%rax) ; AVX512DQ-BW-NEXT: vmovdqa64 %zmm14, 448(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm9, 384(%rax) -; AVX512DQ-BW-NEXT: vmovdqa64 %zmm0, 64(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm11, 384(%rax) +; AVX512DQ-BW-NEXT: vmovdqa64 %zmm2, 64(%rax) ; AVX512DQ-BW-NEXT: vzeroupper ; AVX512DQ-BW-NEXT: retq ; @@ -8481,172 +8432,173 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-BW-FCP: # %bb.0: ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rsi), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rsi), %xmm16 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm21 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdi), %xmm20 +; 
AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdi), %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15] ; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm4 = [2312,2826,3340,3854] -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm3, %ymm3 -; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm5 = [1284,1798] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} ymm5 = [2312,2826,3340,3854] +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpmovsxwq {{.*#+}} xmm6 = [1284,1798] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm0, %xmm4 ; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm0 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %xmm3 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm22 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rcx), %xmm21 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rcx), %xmm18 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm23 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm24 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm8 = [0,0,2,1,2,1,6,7,0,2,2,3,2,3,6,7,4,4,2,3,6,5,6,5,4,6,2,3,6,7,6,7] -; AVX512DQ-BW-FCP-NEXT: movl $572662306, %r11d # imm = 0x22222222 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k1 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm6, %zmm8, 
%zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm6 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm25 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rax), %xmm9 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rdx), %xmm22 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rdx), %xmm23 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm3[8],xmm7[9],xmm3[9],xmm7[10],xmm3[10],xmm7[11],xmm3[11],xmm7[12],xmm3[12],xmm7[13],xmm3[13],xmm7[14],xmm3[14],xmm7[15],xmm3[15] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm4, %ymm4, %ymm4 +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm9 = [0,32,2,3,4,33,6,7,8,42,10,11,12,43,14,15,16,36,18,19,20,37,22,23,24,46,26,27,28,47,30,31] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm4, %zmm9, %zmm0 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r10), %xmm4 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r10), %xmm24 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rax), %xmm8 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%rax), %xmm26 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm9[8],xmm6[8],xmm9[9],xmm6[9],xmm9[10],xmm6[10],xmm9[11],xmm6[11],xmm9[12],xmm6[12],xmm9[13],xmm6[13],xmm9[14],xmm6[14],xmm9[15],xmm6[15] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm10 = xmm8[8],xmm4[8],xmm8[9],xmm4[9],xmm8[10],xmm4[10],xmm8[11],xmm4[11],xmm8[12],xmm4[12],xmm8[13],xmm4[13],xmm8[14],xmm4[14],xmm8[15],xmm4[15] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm10, %ymm10, %ymm13 ; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r9), %xmm10 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm27 -; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm11 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm28 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm12 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm13 = [0,1,0,1,4,5,1,3,2,1,2,1,4,5,3,3,0,1,4,5,4,5,5,7,0,1,6,5,6,5,7,7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm12, %zmm13, %zmm12 -; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = 
[0,0,0,0,4,5,2,1,0,2,0,2,4,5,2,3,0,1,4,4,4,4,6,5,0,1,4,6,4,6,6,7] -; AVX512DQ-BW-FCP-NEXT: movl $-2004318072, %r11d # imm = 0x88888888 -; AVX512DQ-BW-FCP-NEXT: kmovd %r11d, %k2 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm15, %zmm14, %zmm12 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm18[0],xmm17[0],xmm18[1],xmm17[1],xmm18[2],xmm17[2],xmm18[3],xmm17[3],xmm18[4],xmm17[4],xmm18[5],xmm17[5],xmm18[6],xmm17[6],xmm18[7],xmm17[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm16, %ymm16 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm15, %xmm29 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm15 = xmm15[0],zero,zero,zero,xmm15[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm29, %ymm15, %ymm15 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm15, %zmm15 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm24[0],xmm19[0],xmm24[1],xmm19[1],xmm24[2],xmm19[2],xmm24[3],xmm19[3],xmm24[4],xmm19[4],xmm24[5],xmm19[5],xmm24[6],xmm19[6],xmm24[7],xmm19[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm8, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm29 = xmm26[0],xmm25[0],xmm26[1],xmm25[1],xmm26[2],xmm25[2],xmm26[3],xmm25[3],xmm26[4],xmm25[4],xmm26[5],xmm25[5],xmm26[6],xmm25[6],xmm26[7],xmm25[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm16 = xmm28[0],xmm27[0],xmm28[1],xmm27[1],xmm28[2],xmm27[2],xmm28[3],xmm27[3],xmm28[4],xmm27[4],xmm28[5],xmm27[5],xmm28[6],xmm27[6],xmm28[7],xmm27[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm16, %zmm13, %zmm16 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm29, %zmm14, %zmm16 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm18[8],xmm17[8],xmm18[9],xmm17[9],xmm18[10],xmm17[10],xmm18[11],xmm17[11],xmm18[12],xmm17[12],xmm18[13],xmm17[13],xmm18[14],xmm17[14],xmm18[15],xmm17[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm17, %xmm18 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm29 = xmm17[0],zero,zero,zero,xmm17[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: 
vinserti32x4 $1, %xmm18, %ymm29, %ymm18 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm29 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r9), %xmm28 +; AVX512DQ-BW-FCP-NEXT: vmovdqa (%r8), %xmm12 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 48(%r8), %xmm30 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm12[8],xmm10[8],xmm12[9],xmm10[9],xmm12[10],xmm10[10],xmm12[11],xmm10[11],xmm12[12],xmm10[12],xmm12[13],xmm10[13],xmm12[14],xmm10[14],xmm12[15],xmm10[15] +; AVX512DQ-BW-FCP-NEXT: vpmovsxbw {{.*#+}} zmm14 = [0,1,0,32,4,5,1,33,2,1,2,42,4,5,3,43,0,1,4,36,4,5,5,37,0,1,6,46,6,5,7,47] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm13, %zmm14, %zmm11 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm17[0],xmm16[0],xmm17[1],xmm16[1],xmm17[2],xmm16[2],xmm17[3],xmm16[3],xmm17[4],xmm16[4],xmm17[5],xmm16[5],xmm17[6],xmm16[6],xmm17[7],xmm16[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm13, %ymm13, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm13, %xmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm13 = xmm13[0],zero,zero,zero,xmm13[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm13, %ymm13 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm15, %zmm13, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm23[0],xmm18[0],xmm23[1],xmm18[1],xmm23[2],xmm18[2],xmm23[3],xmm18[3],xmm23[4],xmm18[4],xmm23[5],xmm18[5],xmm23[6],xmm18[6],xmm23[7],xmm18[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm15, %ymm15, %ymm15 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm15, %zmm9, %zmm13 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = xmm26[0],xmm24[0],xmm26[1],xmm24[1],xmm26[2],xmm24[2],xmm26[3],xmm24[3],xmm26[4],xmm24[4],xmm26[5],xmm24[5],xmm26[6],xmm24[6],xmm26[7],xmm24[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm15, %ymm15, %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm15 = 
xmm30[0],xmm28[0],xmm30[1],xmm28[1],xmm30[2],xmm28[2],xmm30[3],xmm28[3],xmm30[4],xmm28[4],xmm30[5],xmm28[5],xmm30[6],xmm28[6],xmm30[7],xmm28[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm14, %zmm15 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm16 = xmm17[8],xmm16[8],xmm17[9],xmm16[9],xmm17[10],xmm16[10],xmm17[11],xmm16[11],xmm17[12],xmm16[12],xmm17[13],xmm16[13],xmm17[14],xmm16[14],xmm17[15],xmm16[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm16, %xmm17 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm16[0],zero,zero,zero,xmm16[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm25, %ymm17 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r10), %xmm25 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm16, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm16, %ymm16 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm16, %zmm17, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm29 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm23[8],xmm18[8],xmm23[9],xmm18[9],xmm23[10],xmm18[10],xmm23[11],xmm18[11],xmm23[12],xmm18[12],xmm23[13],xmm18[13],xmm23[14],xmm18[14],xmm23[15],xmm18[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm27 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm17, %ymm17 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm17, %zmm18, %zmm17 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%rax), %xmm30 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm24[8],xmm19[8],xmm24[9],xmm19[9],xmm24[10],xmm19[10],xmm24[11],xmm19[11],xmm24[12],xmm19[12],xmm24[13],xmm19[13],xmm24[14],xmm19[14],xmm24[15],xmm19[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r9), %xmm31 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm18, %zmm8, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa 32(%r8), %xmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm26[8],xmm25[8],xmm26[9],xmm25[9],xmm26[10],xmm25[10],xmm26[11],xmm25[11],xmm26[12],xmm25[12],xmm26[13],xmm25[13],xmm26[14],xmm25[14],xmm26[15],xmm25[15] -; 
AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm18 = xmm28[8],xmm27[8],xmm28[9],xmm27[9],xmm28[10],xmm27[10],xmm28[11],xmm27[11],xmm28[12],xmm27[12],xmm28[13],xmm27[13],xmm28[14],xmm27[14],xmm28[15],xmm27[15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm18, %zmm13, %zmm18 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm19, %zmm14, %zmm18 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm21[0],xmm20[0],xmm21[1],xmm20[1],xmm21[2],xmm20[2],xmm21[3],xmm20[3],xmm21[4],xmm20[4],xmm21[5],xmm20[5],xmm21[6],xmm20[6],xmm21[7],xmm20[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm24, %ymm24 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm19, %xmm25 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm19 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm19, %ymm19 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm24, %zmm19, %zmm19 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm23[0],xmm22[0],xmm23[1],xmm22[1],xmm23[2],xmm22[2],xmm23[3],xmm22[3],xmm23[4],xmm22[4],xmm23[5],xmm22[5],xmm23[6],xmm22[6],xmm23[7],xmm22[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm24, %zmm8, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm30[0],xmm29[0],xmm30[1],xmm29[1],xmm30[2],xmm29[2],xmm30[3],xmm29[3],xmm30[4],xmm29[4],xmm30[5],xmm29[5],xmm30[6],xmm29[6],xmm30[7],xmm29[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm24 = xmm0[0],xmm31[0],xmm0[1],xmm31[1],xmm0[2],xmm31[2],xmm0[3],xmm31[3],xmm0[4],xmm31[4],xmm0[5],xmm31[5],xmm0[6],xmm31[6],xmm0[7],xmm31[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm24, %zmm13, %zmm24 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm25, %zmm14, %zmm24 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm21[8],xmm20[8],xmm21[9],xmm20[9],xmm21[10],xmm20[10],xmm21[11],xmm20[11],xmm21[12],xmm20[12],xmm21[13],xmm20[13],xmm21[14],xmm20[14],xmm21[15],xmm20[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm20, %xmm21 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = 
xmm20[0],zero,zero,zero,xmm20[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm25, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm17, %zmm9, %zmm16 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 32(%r8), %xmm31 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm26[8],xmm24[8],xmm26[9],xmm24[9],xmm26[10],xmm24[10],xmm26[11],xmm24[11],xmm26[12],xmm24[12],xmm26[13],xmm24[13],xmm26[14],xmm24[14],xmm26[15],xmm24[15] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm17, %ymm17, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm17 = xmm30[8],xmm28[8],xmm30[9],xmm28[9],xmm30[10],xmm28[10],xmm30[11],xmm28[11],xmm30[12],xmm28[12],xmm30[13],xmm28[13],xmm30[14],xmm28[14],xmm30[15],xmm28[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm18, %zmm14, %zmm17 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm20[0],xmm19[0],xmm20[1],xmm19[1],xmm20[2],xmm19[2],xmm20[3],xmm19[3],xmm20[4],xmm19[4],xmm20[5],xmm19[5],xmm20[6],xmm19[6],xmm20[7],xmm19[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm18, %ymm18, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm23, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm18, %xmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm18 = xmm18[0],zero,zero,zero,xmm18[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm18, %ymm18 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm23, %zmm18, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm22[0],xmm21[0],xmm22[1],xmm21[1],xmm22[2],xmm21[2],xmm22[3],xmm21[3],xmm22[4],xmm21[4],xmm22[5],xmm21[5],xmm22[6],xmm21[6],xmm22[7],xmm21[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm23 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm23, %zmm9, %zmm18 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = xmm29[0],xmm25[0],xmm29[1],xmm25[1],xmm29[2],xmm25[2],xmm29[3],xmm25[3],xmm29[4],xmm25[4],xmm29[5],xmm25[5],xmm29[6],xmm25[6],xmm29[7],xmm25[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm23, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm23 = 
xmm31[0],xmm27[0],xmm31[1],xmm27[1],xmm31[2],xmm27[2],xmm31[3],xmm27[3],xmm31[4],xmm27[4],xmm31[5],xmm27[5],xmm31[6],xmm27[6],xmm31[7],xmm27[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm14, %zmm23 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm19 = xmm20[8],xmm19[8],xmm20[9],xmm19[9],xmm20[10],xmm19[10],xmm20[11],xmm19[11],xmm20[12],xmm19[12],xmm20[13],xmm19[13],xmm20[14],xmm19[14],xmm20[15],xmm19[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm19, %xmm20 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm24 = xmm19[0],zero,zero,zero,xmm19[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm24, %ymm20 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm19, %ymm19, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm19, %ymm19 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm19, %zmm20, %zmm19 +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm24 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm22[8],xmm21[8],xmm22[9],xmm21[9],xmm22[10],xmm21[10],xmm22[11],xmm21[11],xmm22[12],xmm21[12],xmm22[13],xmm21[13],xmm22[14],xmm21[14],xmm22[15],xmm21[15] +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm22 ; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm20, %ymm20 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm20, %zmm21, %zmm20 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rsi), %xmm25 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm22[8],xmm23[9],xmm22[9],xmm23[10],xmm22[10],xmm23[11],xmm22[11],xmm23[12],xmm22[12],xmm23[13],xmm22[13],xmm23[14],xmm22[14],xmm23[15],xmm22[15] -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdi), %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm21, %zmm8, %zmm20 {%k1} +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm20, %zmm9, %zmm19 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rcx), %xmm26 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = 
xmm30[8],xmm29[8],xmm30[9],xmm29[9],xmm30[10],xmm29[10],xmm30[11],xmm29[11],xmm30[12],xmm29[12],xmm30[13],xmm29[13],xmm30[14],xmm29[14],xmm30[15],xmm29[15] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm31[8],xmm0[9],xmm31[9],xmm0[10],xmm31[10],xmm0[11],xmm31[11],xmm0[12],xmm31[12],xmm0[13],xmm31[13],xmm0[14],xmm31[14],xmm0[15],xmm31[15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm13, %zmm21 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm22, %zmm14, %zmm21 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm23[0],xmm25[0],xmm23[1],xmm25[1],xmm23[2],xmm25[2],xmm23[3],xmm25[3],xmm23[4],xmm25[4],xmm23[5],xmm25[5],xmm23[6],xmm25[6],xmm23[7],xmm25[7] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm22 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm27, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm29[8],xmm25[8],xmm29[9],xmm25[9],xmm29[10],xmm25[10],xmm29[11],xmm25[11],xmm29[12],xmm25[12],xmm29[13],xmm25[13],xmm29[14],xmm25[14],xmm29[15],xmm25[15] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm20, %ymm20, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm20 = xmm31[8],xmm27[8],xmm31[9],xmm27[9],xmm31[10],xmm27[10],xmm31[11],xmm27[11],xmm31[12],xmm27[12],xmm31[13],xmm27[13],xmm31[14],xmm27[14],xmm31[15],xmm27[15] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm21, %zmm14, %zmm20 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm21 = xmm22[0],xmm24[0],xmm22[1],xmm24[1],xmm22[2],xmm24[2],xmm22[3],xmm24[3],xmm22[4],xmm24[4],xmm22[5],xmm24[5],xmm22[6],xmm24[6],xmm22[7],xmm24[7] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm21, %xmm25 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm27 = xmm21[0],zero,zero,zero,xmm21[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm27, %ymm25 ; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%rdx), %xmm27 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm0, 
%ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm22, %zmm22 -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm0, %zmm8, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm23[8],xmm25[8],xmm23[9],xmm25[9],xmm23[10],xmm25[10],xmm23[11],xmm25[11],xmm23[12],xmm25[12],xmm23[13],xmm25[13],xmm23[14],xmm25[14],xmm23[15],xmm25[15] -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm0, %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm23, %ymm25, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm0, %ymm0 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm23, %zmm0 -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm23 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm23, %zmm8, %zmm0 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload -; AVX512DQ-BW-FCP-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7] -; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm2, %ymm2, %ymm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm4, %ymm23, %ymm4 -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 16(%r10), %xmm23 -; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm5, %xmm2, %xmm5 -; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero -; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rax), %xmm5 -; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm2 -; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm4 +; 
AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm21, %ymm21, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm21, %ymm21 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm21, %zmm25, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm25 = xmm27[0],xmm26[0],xmm27[1],xmm26[1],xmm27[2],xmm26[2],xmm27[3],xmm26[3],xmm27[4],xmm26[4],xmm27[5],xmm26[5],xmm27[6],xmm26[6],xmm27[7],xmm26[7] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm25, %ymm25, %ymm25 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm25, %zmm9, %zmm21 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm22 = xmm22[8],xmm24[8],xmm22[9],xmm24[9],xmm22[10],xmm24[10],xmm22[11],xmm24[11],xmm22[12],xmm24[12],xmm22[13],xmm24[13],xmm22[14],xmm24[14],xmm22[15],xmm24[15] +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm22, %xmm24 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm25 = xmm22[0],zero,zero,zero,xmm22[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm25, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm22, %ymm22, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm22, %ymm22 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm22, %zmm24, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm27[8],xmm26[8],xmm27[9],xmm26[9],xmm27[10],xmm26[10],xmm27[11],xmm26[11],xmm27[12],xmm26[12],xmm27[13],xmm26[13],xmm27[14],xmm26[14],xmm27[15],xmm26[15] +; AVX512DQ-BW-FCP-NEXT: vinserti32x4 $1, %xmm24, %ymm24, %ymm24 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm24, %zmm9, %zmm22 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vpshufb %ymm5, %ymm2, %ymm2 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r10), %xmm5 +; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm6, %xmm1, %xmm6 +; AVX512DQ-BW-FCP-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm6, %ymm1, %ymm1 
+; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%rax), %xmm6 +; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r9), %xmm2 ; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm7[0],xmm3[0],xmm7[1],xmm3[1],xmm7[2],xmm3[2],xmm7[3],xmm3[3],xmm7[4],xmm3[4],xmm7[5],xmm3[5],xmm7[6],xmm3[6],xmm7[7],xmm3[7] ; AVX512DQ-BW-FCP-NEXT: vmovdqa 16(%r8), %xmm7 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm8, %zmm2 {%k1} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm23[0],xmm5[1],xmm23[1],xmm5[2],xmm23[2],xmm5[3],xmm23[3],xmm5[4],xmm23[4],xmm5[5],xmm23[5],xmm5[6],xmm23[6],xmm5[7],xmm23[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm8, %zmm13, %zmm8 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm8 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm5[8],xmm23[8],xmm5[9],xmm23[9],xmm5[10],xmm23[10],xmm5[11],xmm23[11],xmm5[12],xmm23[12],xmm5[13],xmm23[13],xmm5[14],xmm23[14],xmm5[15],xmm23[15] -; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm4, %zmm13, %zmm4 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm4 {%k2} -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3],xmm9[4],xmm6[4],xmm9[5],xmm6[5],xmm9[6],xmm6[6],xmm9[7],xmm6[7] -; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm5, %zmm13, %zmm5 -; AVX512DQ-BW-FCP-NEXT: vpermw %zmm3, %zmm14, %zmm5 {%k2} +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; 
AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm9, %zmm1 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm9 +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] +; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm7[8],xmm2[8],xmm7[9],xmm2[9],xmm7[10],xmm2[10],xmm7[11],xmm2[11],xmm7[12],xmm2[12],xmm7[13],xmm2[13],xmm7[14],xmm2[14],xmm7[15],xmm2[15] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm2 +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] +; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm3, %ymm3, %ymm3 +; AVX512DQ-BW-FCP-NEXT: vpermt2w %zmm3, %zmm14, %zmm4 ; AVX512DQ-BW-FCP-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1 -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm16, %zmm15 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm18, %zmm17 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm24, %zmm19 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm21, %zmm20 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm8, %zmm22 {%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm0 
{%k1} -; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm5, %zmm2 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm11, %zmm0 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm17, %zmm16 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm23, %zmm18 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm20, %zmm19 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm9, %zmm21 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm2, %zmm22 {%k1} +; AVX512DQ-BW-FCP-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1} ; AVX512DQ-BW-FCP-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm2, (%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 192(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 128(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm20, 320(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 256(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm17, 448(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm15, 384(%rax) -; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, 64(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm1, (%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm22, 192(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm21, 128(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm19, 320(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm18, 256(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm16, 448(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm13, 384(%rax) +; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm0, 64(%rax) ; AVX512DQ-BW-FCP-NEXT: vzeroupper ; AVX512DQ-BW-FCP-NEXT: retq %in.vec0 = load <64 x i8>, ptr %in.vecptr0, align 64 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll index 68967c2ce6536..c33776daf18fa 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512f.ll @@ -964,41 +964,11 @@ define <8 x i64> @combine_vpermvar_insertion_as_broadcast_v8i64(i64 %a0) { } define <16 x i32> @blend_of_permutes_v16i32(<8 x i64> %a0, <8x i64> %a1) { -; 
X86-AVX512F-LABEL: blend_of_permutes_v16i32: -; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; X86-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; X86-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A -; X86-AVX512F-NEXT: kmovw %eax, %k1 -; X86-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; X86-AVX512F-NEXT: retl -; -; X86-AVX512BW-LABEL: blend_of_permutes_v16i32: -; X86-AVX512BW: # %bb.0: -; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; X86-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; X86-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A -; X86-AVX512BW-NEXT: kmovd %eax, %k1 -; X86-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; X86-AVX512BW-NEXT: retl -; -; X64-AVX512F-LABEL: blend_of_permutes_v16i32: -; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; X64-AVX512F-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; X64-AVX512F-NEXT: movw $-25958, %ax # imm = 0x9A9A -; X64-AVX512F-NEXT: kmovw %eax, %k1 -; X64-AVX512F-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; X64-AVX512F-NEXT: retq -; -; X64-AVX512BW-LABEL: blend_of_permutes_v16i32: -; X64-AVX512BW: # %bb.0: -; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm0 = zmm0[2,3,0,1,6,7,4,5] -; X64-AVX512BW-NEXT: vpermq {{.*#+}} zmm1 = zmm1[2,3,0,1,6,7,4,5] -; X64-AVX512BW-NEXT: movw $-25958, %ax # imm = 0x9A9A -; X64-AVX512BW-NEXT: kmovd %eax, %k1 -; X64-AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1} -; X64-AVX512BW-NEXT: retq +; CHECK-LABEL: blend_of_permutes_v16i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [4,21,6,23,16,1,2,19,12,29,14,31,24,9,10,27] +; CHECK-NEXT: vpermt2d %zmm1, %zmm2, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} %s0 = shufflevector <8 x i64> %a0, <8 x i64> undef, <8 x i32> %s1 = shufflevector <8 x i64> %a1, <8 x i64> undef, <8 x i32> %x0 = bitcast <8 x i64> %s0 to <16 x i32> From 4079ed3c9e72d64746c5d3f05fc585d844c1e8a7 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 
12 Jun 2025 17:35:55 +0900 Subject: [PATCH 197/851] ARM: Move setting of more runtime libcalls to RuntimeLibcallInfo (#143826) These are the easy cases that do not really depend on the subtarget, other than for the deceptive predicates on the subtarget class. Most of the rest of the cases here also do not, but this is obscured by going through helper predicates added onto the subtarget which hide dependence on TargetOptions. --- llvm/lib/IR/RuntimeLibcalls.cpp | 28 +++++++++++++++++++++++ llvm/lib/Target/ARM/ARMISelLowering.cpp | 30 ------------------------- 2 files changed, 28 insertions(+), 30 deletions(-) diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 31013310a746d..331b319511aed 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -79,6 +79,34 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) { } } } + + if (TT.isOSWindows()) { + static const struct { + const RTLIB::Libcall Op; + const char *const Name; + const CallingConv::ID CC; + } LibraryCalls[] = { + {RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP}, + {RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP}, + }; + + for (const auto &LC : LibraryCalls) { + Info.setLibcallName(LC.Op, LC.Name); + Info.setLibcallCallingConv(LC.Op, LC.CC); + } + } + + // Use divmod compiler-rt calls for iOS 5.0 and later. 
+ if (TT.isOSBinFormatMachO() && (!TT.isiOS() || !TT.isOSVersionLT(5, 0))) { + Info.setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); + Info.setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); + } } static void setMSP430Libcalls(RuntimeLibcallsInfo &Info, const Triple &TT) { diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 8455eef9bad32..d2e910a248f23 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -708,36 +708,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } } - if (Subtarget->isTargetWindows()) { - static const struct { - const RTLIB::Libcall Op; - const char * const Name; - const CallingConv::ID CC; - } LibraryCalls[] = { - { RTLIB::FPTOSINT_F32_I64, "__stoi64", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::FPTOSINT_F64_I64, "__dtoi64", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::FPTOUINT_F32_I64, "__stou64", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::FPTOUINT_F64_I64, "__dtou64", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::SINTTOFP_I64_F32, "__i64tos", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::SINTTOFP_I64_F64, "__i64tod", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::UINTTOFP_I64_F32, "__u64tos", CallingConv::ARM_AAPCS_VFP }, - { RTLIB::UINTTOFP_I64_F64, "__u64tod", CallingConv::ARM_AAPCS_VFP }, - }; - - for (const auto &LC : LibraryCalls) { - setLibcallName(LC.Op, LC.Name); - setLibcallCallingConv(LC.Op, LC.CC); - } - } - - // Use divmod compiler-rt calls for iOS 5.0 and later. - if (Subtarget->isTargetMachO() && - !(Subtarget->isTargetIOS() && - Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { - setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); - setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); - } - // The half <-> float conversion functions are always soft-float on // non-watchos platforms, but are needed for some targets which use a // hard-float calling convention by default. 
From 5434b85d2c7a83d9cebae06dad2f9d630e9a3927 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 12 Jun 2025 17:38:52 +0900 Subject: [PATCH 198/851] ARM: Remove fake entries for divrem libcalls (#143832) This was defining aliases of the i32 divrem functions for the i8 and i16 cases. This is unnecessary and was unused. The divrem candidate cases wouldn't have formed with illegal types in the first place, so codegen wouldn't even query these. --- llvm/lib/IR/RuntimeLibcalls.cpp | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index 331b319511aed..d84c56f0af5c6 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -41,13 +41,8 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) { const char *const Name; const CallingConv::ID CC; } LibraryCalls[] = { - {RTLIB::SDIVREM_I8, "__rt_sdiv", CallingConv::ARM_AAPCS}, - {RTLIB::SDIVREM_I16, "__rt_sdiv", CallingConv::ARM_AAPCS}, {RTLIB::SDIVREM_I32, "__rt_sdiv", CallingConv::ARM_AAPCS}, {RTLIB::SDIVREM_I64, "__rt_sdiv64", CallingConv::ARM_AAPCS}, - - {RTLIB::UDIVREM_I8, "__rt_udiv", CallingConv::ARM_AAPCS}, - {RTLIB::UDIVREM_I16, "__rt_udiv", CallingConv::ARM_AAPCS}, {RTLIB::UDIVREM_I32, "__rt_udiv", CallingConv::ARM_AAPCS}, {RTLIB::UDIVREM_I64, "__rt_udiv64", CallingConv::ARM_AAPCS}, }; @@ -62,13 +57,8 @@ static void setARMLibcallNames(RuntimeLibcallsInfo &Info, const Triple &TT) { const char *const Name; const CallingConv::ID CC; } LibraryCalls[] = { - {RTLIB::SDIVREM_I8, "__aeabi_idivmod", CallingConv::ARM_AAPCS}, - {RTLIB::SDIVREM_I16, "__aeabi_idivmod", CallingConv::ARM_AAPCS}, {RTLIB::SDIVREM_I32, "__aeabi_idivmod", CallingConv::ARM_AAPCS}, {RTLIB::SDIVREM_I64, "__aeabi_ldivmod", CallingConv::ARM_AAPCS}, - - {RTLIB::UDIVREM_I8, "__aeabi_uidivmod", CallingConv::ARM_AAPCS}, - {RTLIB::UDIVREM_I16, "__aeabi_uidivmod", CallingConv::ARM_AAPCS}, {RTLIB::UDIVREM_I32, "__aeabi_uidivmod", 
CallingConv::ARM_AAPCS}, {RTLIB::UDIVREM_I64, "__aeabi_uldivmod", CallingConv::ARM_AAPCS}, }; From ce621041c2f162c50d630810491c2feee8eb6c64 Mon Sep 17 00:00:00 2001 From: Pengcheng Wang Date: Thu, 12 Jun 2025 16:39:57 +0800 Subject: [PATCH 199/851] [RISCV] Get host CPU name via hwprobe (#142745) We can get the `mvendorid/marchid/mimpid` via hwprobe and then we can compare these IDs with those defined in processors to find the CPU name. With this change, `-mcpu/-mtune=native` can set the proper name. --- .../llvm/TargetParser/RISCVTargetParser.h | 8 +++++ llvm/lib/TargetParser/Host.cpp | 30 +++++++++++++++---- llvm/lib/TargetParser/RISCVTargetParser.cpp | 15 +++++++--- 3 files changed, 44 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/TargetParser/RISCVTargetParser.h b/llvm/include/llvm/TargetParser/RISCVTargetParser.h index 41fdab6012aa0..19a8af0cb9567 100644 --- a/llvm/include/llvm/TargetParser/RISCVTargetParser.h +++ b/llvm/include/llvm/TargetParser/RISCVTargetParser.h @@ -29,6 +29,13 @@ struct CPUModel { uint32_t MVendorID; uint64_t MArchID; uint64_t MImpID; + + bool isValid() const { return MVendorID != 0 && MArchID != 0 && MImpID != 0; } + + bool operator==(const CPUModel &Other) const { + return MVendorID == Other.MVendorID && MArchID == Other.MArchID && + MImpID == Other.MImpID; + } }; struct CPUInfo { @@ -58,6 +65,7 @@ LLVM_ABI bool hasFastScalarUnalignedAccess(StringRef CPU); LLVM_ABI bool hasFastVectorUnalignedAccess(StringRef CPU); LLVM_ABI bool hasValidCPUModel(StringRef CPU); LLVM_ABI CPUModel getCPUModel(StringRef CPU); +LLVM_ABI StringRef getCPUNameFromCPUModel(const CPUModel &Model); } // namespace RISCV diff --git a/llvm/lib/TargetParser/Host.cpp b/llvm/lib/TargetParser/Host.cpp index 14acef116708a..5957e1befe2da 100644 --- a/llvm/lib/TargetParser/Host.cpp +++ b/llvm/lib/TargetParser/Host.cpp @@ -18,6 +18,7 @@ #include "llvm/Config/llvm-config.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/raw_ostream.h" +#include 
"llvm/TargetParser/RISCVTargetParser.h" #include "llvm/TargetParser/Triple.h" #include "llvm/TargetParser/X86TargetParser.h" #include @@ -1672,8 +1673,32 @@ StringRef sys::getHostCPUName() { return "generic"; } #elif defined(__riscv) +#if defined(__linux__) +// struct riscv_hwprobe +struct RISCVHwProbe { + int64_t Key; + uint64_t Value; +}; +#endif + StringRef sys::getHostCPUName() { #if defined(__linux__) + // Try the hwprobe way first. + RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_MVENDORID=*/0, 0}, + {/*RISCV_HWPROBE_KEY_MARCHID=*/1, 0}, + {/*RISCV_HWPROBE_KEY_MIMPID=*/2, 0}}; + int Ret = syscall(/*__NR_riscv_hwprobe=*/258, /*pairs=*/Query, + /*pair_count=*/std::size(Query), /*cpu_count=*/0, + /*cpus=*/0, /*flags=*/0); + if (Ret == 0) { + RISCV::CPUModel Model{static_cast(Query[0].Value), Query[1].Value, + Query[2].Value}; + StringRef Name = RISCV::getCPUNameFromCPUModel(Model); + if (!Name.empty()) + return Name; + } + + // Then try the cpuinfo way. std::unique_ptr P = getProcCpuinfoContent(); StringRef Content = P ? 
P->getBuffer() : ""; StringRef Name = detail::getHostCPUNameForRISCV(Content); @@ -2148,11 +2173,6 @@ const StringMap sys::getHostCPUFeatures() { return Features; } #elif defined(__linux__) && defined(__riscv) -// struct riscv_hwprobe -struct RISCVHwProbe { - int64_t Key; - uint64_t Value; -}; const StringMap sys::getHostCPUFeatures() { RISCVHwProbe Query[]{{/*RISCV_HWPROBE_KEY_BASE_BEHAVIOR=*/3, 0}, {/*RISCV_HWPROBE_KEY_IMA_EXT_0=*/4, 0}, diff --git a/llvm/lib/TargetParser/RISCVTargetParser.cpp b/llvm/lib/TargetParser/RISCVTargetParser.cpp index 2e5e8f4e50c9c..9957ec0c28d88 100644 --- a/llvm/lib/TargetParser/RISCVTargetParser.cpp +++ b/llvm/lib/TargetParser/RISCVTargetParser.cpp @@ -57,10 +57,7 @@ bool hasFastVectorUnalignedAccess(StringRef CPU) { return Info && Info->FastVectorUnalignedAccess; } -bool hasValidCPUModel(StringRef CPU) { - const CPUModel Model = getCPUModel(CPU); - return Model.MVendorID != 0 && Model.MArchID != 0 && Model.MImpID != 0; -} +bool hasValidCPUModel(StringRef CPU) { return getCPUModel(CPU).isValid(); } CPUModel getCPUModel(StringRef CPU) { const CPUInfo *Info = getCPUInfoByName(CPU); @@ -69,6 +66,16 @@ CPUModel getCPUModel(StringRef CPU) { return Info->Model; } +StringRef getCPUNameFromCPUModel(const CPUModel &Model) { + if (!Model.isValid()) + return ""; + + for (auto &C : RISCVCPUInfo) + if (C.Model == Model) + return C.Name; + return ""; +} + bool parseCPU(StringRef CPU, bool IsRV64) { const CPUInfo *Info = getCPUInfoByName(CPU); From 4551e5035565606eb04253a35f31d51685657436 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Thu, 12 Jun 2025 10:49:23 +0200 Subject: [PATCH 200/851] [clang] Reset FileID based diag state mappings (#143695) When sharing same compiler instance for multiple compilations, we reset source manager's file id tables in between runs. Diagnostics engine keeps a cache based on these file ids, that became dangling references across compilations. 
This patch makes sure we reset those whenever sourcemanager is trashing its FileIDs. --- clang/include/clang/Basic/Diagnostic.h | 13 +++-- clang/lib/Basic/Diagnostic.cpp | 4 +- clang/lib/Basic/SourceManager.cpp | 3 ++ .../Frontend/CompilerInstanceTest.cpp | 51 +++++++++++++++++++ 4 files changed, 67 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/Diagnostic.h b/clang/include/clang/Basic/Diagnostic.h index efee8302e7501..7ae4ef7df138c 100644 --- a/clang/include/clang/Basic/Diagnostic.h +++ b/clang/include/clang/Basic/Diagnostic.h @@ -424,10 +424,13 @@ class DiagnosticsEngine : public RefCountedBase { bool empty() const { return Files.empty(); } /// Clear out this map. - void clear() { + void clear(bool Soft) { + // Just clear the cache when in soft mode. Files.clear(); - FirstDiagState = CurDiagState = nullptr; - CurDiagStateLoc = SourceLocation(); + if (!Soft) { + FirstDiagState = CurDiagState = nullptr; + CurDiagStateLoc = SourceLocation(); + } } /// Produce a debugging dump of the diagnostic state. @@ -920,6 +923,10 @@ class DiagnosticsEngine : public RefCountedBase { /// Reset the state of the diagnostic object to its initial configuration. /// \param[in] soft - if true, doesn't reset the diagnostic mappings and state void Reset(bool soft = false); + /// We keep a cache of FileIDs for diagnostics mapped by pragmas. These might + /// get invalidated when diagnostics engine is shared across different + /// compilations. Provide users with a way to reset that. + void ResetPragmas(); //===--------------------------------------------------------------------===// // DiagnosticsEngine classification and reporting interfaces. 
diff --git a/clang/lib/Basic/Diagnostic.cpp b/clang/lib/Basic/Diagnostic.cpp index 95d86cb153b4b..a30bfa28eca71 100644 --- a/clang/lib/Basic/Diagnostic.cpp +++ b/clang/lib/Basic/Diagnostic.cpp @@ -119,6 +119,8 @@ bool DiagnosticsEngine::popMappings(SourceLocation Loc) { return true; } +void DiagnosticsEngine::ResetPragmas() { DiagStatesByLoc.clear(/*Soft=*/true); } + void DiagnosticsEngine::Reset(bool soft /*=false*/) { ErrorOccurred = false; UncompilableErrorOccurred = false; @@ -135,7 +137,7 @@ void DiagnosticsEngine::Reset(bool soft /*=false*/) { if (!soft) { // Clear state related to #pragma diagnostic. DiagStates.clear(); - DiagStatesByLoc.clear(); + DiagStatesByLoc.clear(false); DiagStateOnPushStack.clear(); // Create a DiagState and DiagStatePoint representing diagnostic changes diff --git a/clang/lib/Basic/SourceManager.cpp b/clang/lib/Basic/SourceManager.cpp index 09e5c6547fb51..053e82683a4a6 100644 --- a/clang/lib/Basic/SourceManager.cpp +++ b/clang/lib/Basic/SourceManager.cpp @@ -344,6 +344,9 @@ void SourceManager::clearIDTables() { NextLocalOffset = 0; CurrentLoadedOffset = MaxLoadedOffset; createExpansionLoc(SourceLocation(), SourceLocation(), SourceLocation(), 1); + // Diagnostics engine keeps some references to fileids, mostly for dealing + // with diagnostic pragmas, make sure they're reset as well. 
+ Diag.ResetPragmas(); } bool SourceManager::isMainFile(const FileEntry &SourceFile) { diff --git a/clang/unittests/Frontend/CompilerInstanceTest.cpp b/clang/unittests/Frontend/CompilerInstanceTest.cpp index a7b258d5e537e..459a3864887e1 100644 --- a/clang/unittests/Frontend/CompilerInstanceTest.cpp +++ b/clang/unittests/Frontend/CompilerInstanceTest.cpp @@ -9,9 +9,12 @@ #include "clang/Frontend/CompilerInstance.h" #include "clang/Basic/FileManager.h" #include "clang/Frontend/CompilerInvocation.h" +#include "clang/Frontend/FrontendActions.h" #include "clang/Frontend/TextDiagnosticPrinter.h" +#include "llvm/ADT/IntrusiveRefCntPtr.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" +#include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/VirtualFileSystem.h" #include "gtest/gtest.h" @@ -97,4 +100,52 @@ TEST(CompilerInstance, AllowDiagnosticLogWithUnownedDiagnosticConsumer) { ASSERT_EQ(DiagnosticOutput, "error: expected no crash\n"); } +TEST(CompilerInstance, MultipleInputsCleansFileIDs) { + auto VFS = makeIntrusiveRefCnt(); + VFS->addFile("a.cc", /*ModificationTime=*/{}, + MemoryBuffer::getMemBuffer(R"cpp( + #include "a.h" + )cpp")); + // Paddings of `void foo();` in the sources below are "important". We're + // testing against source locations from previous compilations colliding. + // Hence the `unused` variable in `b.h` needs to be within `#pragma clang + // diagnostic` block from `a.h`. 
+ VFS->addFile("a.h", /*ModificationTime=*/{}, MemoryBuffer::getMemBuffer(R"cpp( + #include "b.h" + #pragma clang diagnostic push + #pragma clang diagnostic warning "-Wunused" + void foo(); + #pragma clang diagnostic pop + )cpp")); + VFS->addFile("b.h", /*ModificationTime=*/{}, MemoryBuffer::getMemBuffer(R"cpp( + void foo(); void foo(); void foo(); void foo(); + inline void foo() { int unused = 2; } + )cpp")); + + DiagnosticOptions DiagOpts; + IntrusiveRefCntPtr Diags = + CompilerInstance::createDiagnostics(*VFS, DiagOpts); + + CreateInvocationOptions CIOpts; + CIOpts.Diags = Diags; + + const char *Args[] = {"clang", "-xc++", "a.cc"}; + std::shared_ptr CInvok = + createInvocation(Args, std::move(CIOpts)); + ASSERT_TRUE(CInvok) << "could not create compiler invocation"; + + CompilerInstance Instance(std::move(CInvok)); + Instance.setDiagnostics(Diags.get()); + Instance.createFileManager(VFS); + + // Run once for `a.cc` and then for `a.h`. This makes sure we get the same + // file ID for `b.h` in the second run as `a.h` from first run. + const auto &OrigInputKind = Instance.getFrontendOpts().Inputs[0].getKind(); + Instance.getFrontendOpts().Inputs.emplace_back("a.h", OrigInputKind); + + SyntaxOnlyAction Act; + EXPECT_TRUE(Instance.ExecuteAction(Act)) << "Failed to execute action"; + EXPECT_FALSE(Diags->hasErrorOccurred()); + EXPECT_EQ(Diags->getNumWarnings(), 0u); +} } // anonymous namespace From db8d34db26e9ea92c08d6e813eca9cce40c48478 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 12 Jun 2025 10:04:08 +0100 Subject: [PATCH 201/851] [VPlan] Set branch weight metadata on middle term in VPlan (NFC) (#143035) Manage branch weights for the BranchOnCond in the middle block in VPlan. This requires updating VPInstruction to inherit from VPIRMetadata, which in general makes sense as there are a number of opcodes that could take metadata. There are other branches (part of the skeleton) that also need branch weights adding. 
PR: https://github.com/llvm/llvm-project/pull/143035 --- .../Transforms/Vectorize/LoopVectorize.cpp | 48 ++++++++++------- llvm/lib/Transforms/Vectorize/VPlan.h | 53 ++++++++++--------- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 6 ++- 3 files changed, 62 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d236111836391..93ab3353a296a 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7273,6 +7273,33 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( BypassBlock, MainResumePhi->getIncomingValueForBlock(BypassBlock)); } +/// Add branch weight metadata, if the \p Plan's middle block is terminated by a +/// BranchOnCond recipe. +static void addBranchWeightToMiddleTerminator(VPlan &Plan, ElementCount VF, + Loop *OrigLoop) { + // 4. Adjust branch weight of the branch in the middle block. + Instruction *LatchTerm = OrigLoop->getLoopLatch()->getTerminator(); + if (!hasBranchWeightMD(*LatchTerm)) + return; + + VPBasicBlock *MiddleVPBB = Plan.getMiddleBlock(); + auto *MiddleTerm = + dyn_cast_or_null(MiddleVPBB->getTerminator()); + // Only add branch metadata if there is a (conditional) terminator. + if (!MiddleTerm) + return; + + assert(MiddleTerm->getOpcode() == VPInstruction::BranchOnCond && + "must have a BranchOnCond"); + // Assume that `Count % VectorTripCount` is equally distributed. 
+ unsigned TripCount = Plan.getUF() * VF.getKnownMinValue(); + assert(TripCount > 0 && "trip count should not be zero"); + MDBuilder MDB(LatchTerm->getContext()); + MDNode *BranchWeights = + MDB.createBranchWeights({1, TripCount - 1}, /*IsExpected=*/false); + MiddleTerm->addMetadata(LLVMContext::MD_prof, BranchWeights); +} + DenseMap LoopVectorizationPlanner::executePlan( ElementCount BestVF, unsigned BestUF, VPlan &BestVPlan, InnerLoopVectorizer &ILV, DominatorTree *DT, bool VectorizingEpilogue) { @@ -7295,11 +7322,8 @@ DenseMap LoopVectorizationPlanner::executePlan( VPlanTransforms::convertToConcreteRecipes(BestVPlan, *Legal->getWidestInductionType()); - // Retrieve and store the middle block before dissolving regions. Regions are - // dissolved after optimizing for VF and UF, which completely removes unneeded - // loop regions first. - VPBasicBlock *MiddleVPBB = - BestVPlan.getVectorLoopRegion() ? BestVPlan.getMiddleBlock() : nullptr; + + addBranchWeightToMiddleTerminator(BestVPlan, BestVF, OrigLoop); VPlanTransforms::dissolveLoopRegions(BestVPlan); // Perform the actual loop transformation. VPTransformState State(&TTI, BestVF, LI, DT, ILV.AC, ILV.Builder, &BestVPlan, @@ -7442,20 +7466,6 @@ DenseMap LoopVectorizationPlanner::executePlan( ILV.printDebugTracesAtEnd(); - // 4. Adjust branch weight of the branch in the middle block. - if (HeaderVPBB) { - auto *MiddleTerm = - cast(State.CFG.VPBB2IRBB[MiddleVPBB]->getTerminator()); - if (MiddleTerm->isConditional() && - hasBranchWeightMD(*OrigLoop->getLoopLatch()->getTerminator())) { - // Assume that `Count % VectorTripCount` is equally distributed. 
- unsigned TripCount = BestVPlan.getUF() * State.VF.getKnownMinValue(); - assert(TripCount > 0 && "trip count should not be zero"); - const uint32_t Weights[] = {1, TripCount - 1}; - setBranchWeights(*MiddleTerm, Weights, /*IsExpected=*/false); - } - } - return ExpandedSCEVs; } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index acc861b991975..468284168e9ca 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -882,11 +882,39 @@ template class VPUnrollPartAccessor { unsigned getUnrollPart(VPUser &U) const; }; +/// Helper to manage IR metadata for recipes. It filters out metadata that +/// cannot be propagated. +class VPIRMetadata { + SmallVector> Metadata; + +public: + VPIRMetadata() {} + + /// Adds metatadata that can be preserved from the original instruction + /// \p I. + VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); } + + /// Adds metatadata that can be preserved from the original instruction + /// \p I and noalias metadata guaranteed by runtime checks using \p LVer. + VPIRMetadata(Instruction &I, LoopVersioning *LVer); + + /// Copy constructor for cloning. + VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} + + /// Add all metadata to \p I. + void applyMetadata(Instruction &I) const; + + void addMetadata(unsigned Kind, MDNode *Node) { + Metadata.emplace_back(Kind, Node); + } +}; + /// This is a concrete Recipe that models a single VPlan-level instruction. /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. 
class VPInstruction : public VPRecipeWithIRFlags, + public VPIRMetadata, public VPUnrollPartAccessor<1> { friend class VPlanSlp; @@ -976,7 +1004,7 @@ class VPInstruction : public VPRecipeWithIRFlags, VPInstruction(unsigned Opcode, ArrayRef Operands, DebugLoc DL = {}, const Twine &Name = "") : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), - Opcode(Opcode), Name(Name.str()) {} + VPIRMetadata(), Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, ArrayRef Operands, const VPIRFlags &Flags, DebugLoc DL = {}, @@ -1268,29 +1296,6 @@ struct VPIRPhi : public VPIRInstruction, public VPPhiAccessors { const VPRecipeBase *getAsRecipe() const override { return this; } }; -/// Helper to manage IR metadata for recipes. It filters out metadata that -/// cannot be propagated. -class VPIRMetadata { - SmallVector> Metadata; - -public: - VPIRMetadata() {} - - /// Adds metatadata that can be preserved from the original instruction - /// \p I. - VPIRMetadata(Instruction &I) { getMetadataToPropagate(&I, Metadata); } - - /// Adds metatadata that can be preserved from the original instruction - /// \p I and noalias metadata guaranteed by runtime checks using \p LVer. - VPIRMetadata(Instruction &I, LoopVersioning *LVer); - - /// Copy constructor for cloning. - VPIRMetadata(const VPIRMetadata &Other) : Metadata(Other.Metadata) {} - - /// Add all metadata to \p I. - void applyMetadata(Instruction &I) const; -}; - /// VPWidenRecipe is a recipe for producing a widened instruction using the /// opcode and operands of the recipe. 
This recipe covers most of the /// traditional vectorization cases where each recipe transforms into a diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 62b99d98a2b5e..f5a2533727b3d 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -410,7 +410,7 @@ VPInstruction::VPInstruction(unsigned Opcode, ArrayRef Operands, const VPIRFlags &Flags, DebugLoc DL, const Twine &Name) : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, Flags, DL), - Opcode(Opcode), Name(Name.str()) { + VPIRMetadata(), Opcode(Opcode), Name(Name.str()) { assert(flagsValidForOpcode(getOpcode()) && "Set flags not supported for the provided opcode"); } @@ -591,7 +591,9 @@ Value *VPInstruction::generate(VPTransformState &State) { } case VPInstruction::BranchOnCond: { Value *Cond = State.get(getOperand(0), VPLane(0)); - return createCondBranch(Cond, getParent(), State); + auto *Br = createCondBranch(Cond, getParent(), State); + applyMetadata(*Br); + return Br; } case VPInstruction::BranchOnCount: { // First create the compare. 
From 2a27c059eccd96b6e46464dbdf69fd2f6237a56c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 12 Jun 2025 10:46:08 +0100 Subject: [PATCH 202/851] [X86] Use BSR passthrough behaviour to fold (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X) (#143662) Make use of targets that support BSR "pass through behaviour" on a zero input to remove a CMOV thats performing the same function BSF will be a trickier patch as we need to make sure it works with the "REP BSF" hack in X86MCInstLower --- llvm/lib/Target/X86/X86ISelLowering.cpp | 10 ++++++++++ llvm/test/CodeGen/X86/bsr.ll | 10 ++++------ llvm/test/CodeGen/X86/pr40090.ll | 11 ++++------- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b0553aa4b8197..f0fbf55e97be9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -49398,6 +49398,8 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, // (ADD (CMOV C1-C2, (CTTZ X), (X != 0)), C2) // Or (CMOV (ADD (CTTZ X), C2), C1, (X == 0)) -> // (ADD (CMOV (CTTZ X), C1-C2, (X == 0)), C2) + // Or (CMOV (BSR ?, X), Y, (X == 0)) -> (BSR Y, X) + // TODO: Or (CMOV (BSF ?, X), Y, (X == 0)) -> (BSF Y, X) if ((CC == X86::COND_NE || CC == X86::COND_E) && Cond.getOpcode() == X86ISD::CMP && isNullConstant(Cond.getOperand(1))) { SDValue Add = TrueOp; @@ -49406,6 +49408,14 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG, if (CC == X86::COND_E) std::swap(Add, Const); + // TODO: ADD BSF support, but requires changes to the "REP BSF" CTTZ hack. + if (Subtarget.hasBitScanPassThrough() && Add.getOpcode() == X86ISD::BSR && + Add.getResNo() == 0 && Add.hasOneUse() && + Add.getOperand(1) == Cond.getOperand(0)) { + return DAG.getNode(Add.getOpcode(), DL, Add->getVTList(), Const, + Add.getOperand(1)); + } + // We might have replaced the constant in the cmov with the LHS of the // compare. If so change it to the RHS of the compare. 
if (Const == Cond.getOperand(0)) diff --git a/llvm/test/CodeGen/X86/bsr.ll b/llvm/test/CodeGen/X86/bsr.ll index 1247b3ec59324..fbca4af425eac 100644 --- a/llvm/test/CodeGen/X86/bsr.ll +++ b/llvm/test/CodeGen/X86/bsr.ll @@ -162,9 +162,8 @@ define i32 @cmov_bsr32(i32 %x, i32 %y) nounwind { ; ; X64-LABEL: cmov_bsr32: ; X64: # %bb.0: -; X64-NEXT: movl $63, %eax +; X64-NEXT: movl %esi, %eax ; X64-NEXT: bsrl %edi, %eax -; X64-NEXT: cmovel %esi, %eax ; X64-NEXT: retq %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 false) %2 = xor i32 %1, 31 @@ -188,8 +187,8 @@ define i32 @cmov_bsr32_undef(i32 %x, i32 %y) nounwind { ; ; X64-LABEL: cmov_bsr32_undef: ; X64: # %bb.0: +; X64-NEXT: movl %esi, %eax ; X64-NEXT: bsrl %edi, %eax -; X64-NEXT: cmovel %esi, %eax ; X64-NEXT: retq %1 = tail call i32 @llvm.ctlz.i32(i32 %x, i1 true) %2 = xor i32 %1, 31 @@ -239,9 +238,8 @@ define i64 @cmov_bsr64(i64 %x, i64 %y) nounwind { ; ; X64-LABEL: cmov_bsr64: ; X64: # %bb.0: -; X64-NEXT: movl $127, %eax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: bsrq %rdi, %rax -; X64-NEXT: cmoveq %rsi, %rax ; X64-NEXT: retq %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 false) %2 = xor i64 %1, 63 @@ -279,8 +277,8 @@ define i64 @cmov_bsr64_undef(i64 %x, i64 %y) nounwind { ; ; X64-LABEL: cmov_bsr64_undef: ; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: bsrq %rdi, %rax -; X64-NEXT: cmoveq %rsi, %rax ; X64-NEXT: retq %1 = tail call i64 @llvm.ctlz.i64(i64 %x, i1 true) %2 = xor i64 %1, 63 diff --git a/llvm/test/CodeGen/X86/pr40090.ll b/llvm/test/CodeGen/X86/pr40090.ll index 24e957ac59f52..af933c950e111 100644 --- a/llvm/test/CodeGen/X86/pr40090.ll +++ b/llvm/test/CodeGen/X86/pr40090.ll @@ -4,10 +4,9 @@ define i64 @foo(i64 %x, i64 %y) { ; CHECK-LABEL: foo: ; CHECK: # %bb.0: -; CHECK-NEXT: bsrq %rdi, %rax -; CHECK-NEXT: orq $64, %rax +; CHECK-NEXT: bsrq %rdi, %rcx +; CHECK-NEXT: orq $64, %rcx ; CHECK-NEXT: bsrq %rsi, %rcx -; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: movl $63, %eax ; CHECK-NEXT: subq %rcx, %rax ; 
CHECK-NEXT: retq @@ -25,11 +24,9 @@ define i64 @bar(i64 %x, i64 %y) { ; CHECK-LABEL: bar: ; CHECK: # %bb.0: ; CHECK-NEXT: movl $127, %ecx -; CHECK-NEXT: movl $127, %eax -; CHECK-NEXT: bsrq %rdi, %rax -; CHECK-NEXT: xorq $64, %rax +; CHECK-NEXT: bsrq %rdi, %rcx +; CHECK-NEXT: xorq $64, %rcx ; CHECK-NEXT: bsrq %rsi, %rcx -; CHECK-NEXT: cmoveq %rax, %rcx ; CHECK-NEXT: movl $63, %eax ; CHECK-NEXT: subq %rcx, %rax ; CHECK-NEXT: retq From 1d1f9afe911c360b9505b5fd2c712cb112c8aa5f Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Thu, 12 Jun 2025 17:42:00 +0800 Subject: [PATCH 203/851] [C++20] [Modules] Treat directly imported internal partition unit as reachable Close https://github.com/llvm/llvm-project/issues/143788 See the discussion for details. --- clang/lib/Sema/SemaLookup.cpp | 23 ++++++++++++++++++----- clang/lib/Sema/SemaModule.cpp | 13 +++++++------ clang/test/Modules/pr143788.cppm | 28 ++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 11 deletions(-) create mode 100644 clang/test/Modules/pr143788.cppm diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index eef134b158438..91822909f1fd3 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -1978,6 +1978,8 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) { if (D->isModulePrivate()) return false; + Module *DeclTopModule = DeclModule->getTopLevelModule(); + // [module.reach]/p1 // A translation unit U is necessarily reachable from a point P if U is a // module interface unit on which the translation unit containing P has an @@ -1996,17 +1998,28 @@ bool LookupResult::isReachableSlow(Sema &SemaRef, NamedDecl *D) { // // Here we only check for the first condition. Since we couldn't see // DeclModule if it isn't (transitively) imported. 
- if (DeclModule->getTopLevelModule()->isModuleInterfaceUnit()) + if (DeclTopModule->isModuleInterfaceUnit()) return true; - // [module.reach]/p2 + // [module.reach]/p1,2 + // A translation unit U is necessarily reachable from a point P if U is a + // module interface unit on which the translation unit containing P has an + // interface dependency, or the translation unit containing P imports U, in + // either case prior to P + // // Additional translation units on // which the point within the program has an interface dependency may be // considered reachable, but it is unspecified which are and under what // circumstances. - // - // The decision here is to treat all additional tranditional units as - // unreachable. + Module *CurrentM = SemaRef.getCurrentModule(); + + // Directly imported module are necessarily reachable. + // Since we can't export import a module implementation partition unit, we + // don't need to count for Exports here. + if (CurrentM && CurrentM->getTopLevelModule()->Imports.count(DeclTopModule)) + return true; + + // Then we treat all module implementation partition unit as unreachable. return false; } diff --git a/clang/lib/Sema/SemaModule.cpp b/clang/lib/Sema/SemaModule.cpp index 6c4df0aa35af5..9fcaad48d3058 100644 --- a/clang/lib/Sema/SemaModule.cpp +++ b/clang/lib/Sema/SemaModule.cpp @@ -712,7 +712,13 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, Mod->Kind == Module::ModuleKind::ModulePartitionImplementation) { Diag(ExportLoc, diag::err_export_partition_impl) << SourceRange(ExportLoc, Path.back().getLoc()); - } else if (!ModuleScopes.empty() && !currentModuleIsImplementation()) { + } else if (ExportLoc.isValid() && + (ModuleScopes.empty() || currentModuleIsImplementation())) { + // [module.interface]p1: + // An export-declaration shall inhabit a namespace scope and appear in the + // purview of a module interface unit. 
+ Diag(ExportLoc, diag::err_export_not_in_module_interface); + } else if (!ModuleScopes.empty()) { // Re-export the module if the imported module is exported. // Note that we don't need to add re-exported module to Imports field // since `Exports` implies the module is imported already. @@ -720,11 +726,6 @@ DeclResult Sema::ActOnModuleImport(SourceLocation StartLoc, getCurrentModule()->Exports.emplace_back(Mod, false); else getCurrentModule()->Imports.insert(Mod); - } else if (ExportLoc.isValid()) { - // [module.interface]p1: - // An export-declaration shall inhabit a namespace scope and appear in the - // purview of a module interface unit. - Diag(ExportLoc, diag::err_export_not_in_module_interface); } return Import; diff --git a/clang/test/Modules/pr143788.cppm b/clang/test/Modules/pr143788.cppm new file mode 100644 index 0000000000000..5ae36d8d0e85a --- /dev/null +++ b/clang/test/Modules/pr143788.cppm @@ -0,0 +1,28 @@ +// RUN: rm -rf %t +// RUN: mkdir -p %t +// RUN: split-file %s %t +// +// RUN: %clang_cc1 -std=c++20 %t/M.cppm -emit-module-interface -o %t/M.pcm +// RUN: %clang_cc1 -std=c++20 %t/P.cppm -emit-module-interface -o %t/P.pcm +// RUN: %clang_cc1 -std=c++20 %t/I.cpp -fmodule-file=M:P=%t/P.pcm -fmodule-file=M=%t/M.pcm -fsyntax-only -verify + +//--- H.hpp +struct S{}; + +//--- M.cppm +export module M; + + +//--- P.cppm +module; +#include "H.hpp" +module M:P; + +using T = S; + +//--- I.cpp +// expected-no-diagnostics +module M; +import :P; + +T f() { return {}; } From 8e4fdff6f02161d878a63900abb35aaa32ff85e9 Mon Sep 17 00:00:00 2001 From: Omair Javaid Date: Thu, 12 Jun 2025 14:48:13 +0500 Subject: [PATCH 204/851] [X86] Update tailcc-ssp.ll assertions using update_llc_test_checks.py (#143500) The assertions in llvm/test/CodeGen/X86/tailcc-ssp.ll were outdated. The initial comment indicated they were generated with `utils/update_llc_test_checks.py UTC_ARGS: --version 5`, but this was not accurate based on the file's content. 
Running `utils/update_llc_test_checks.py` regenerated the assertions, aligning them with the current `llc` output. This commit ensures that the test's claimed behavior accurately reflects the actual `llc` output, even though the tests were already passing. This was identified by @efriedma-quic during review of #136290. Submitting a separate PR to make sure these changes stay isolated. --- llvm/test/CodeGen/X86/tailcc-ssp.ll | 55 ++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/llvm/test/CodeGen/X86/tailcc-ssp.ll b/llvm/test/CodeGen/X86/tailcc-ssp.ll index 5211e4fe9eef9..7ea5dd49f0242 100644 --- a/llvm/test/CodeGen/X86/tailcc-ssp.ll +++ b/llvm/test/CodeGen/X86/tailcc-ssp.ll @@ -78,7 +78,7 @@ define void @tailcall_unrelated_frame() sspreq { ; WINDOWS-NEXT: callq __security_check_cookie ; WINDOWS-NEXT: int3 ; WINDOWS-NEXT: .seh_endproc - +; ; LINUX-LABEL: tailcall_unrelated_frame: ; LINUX: # %bb.0: ; LINUX-NEXT: pushq %rax @@ -97,6 +97,7 @@ define void @tailcall_unrelated_frame() sspreq { ; LINUX-NEXT: .cfi_def_cfa_offset 16 ; LINUX-NEXT: callq __stack_chk_fail@PLT + call void @bar() tail call void @bar() ret void @@ -105,18 +106,48 @@ define void @tailcall_unrelated_frame() sspreq { declare void @callee() define void @caller() sspreq { ; WINDOWS-LABEL: caller: -; WINDOWS: callq callee -; WINDOWS: callq callee -; WINDOWS: cmpq __security_cookie(%rip), %rcx -; WINDOWS: jne -; WINDOWS: callq __security_check_cookie - +; WINDOWS: # %bb.0: +; WINDOWS-NEXT: subq $40, %rsp +; WINDOWS-NEXT: .seh_stackalloc 40 +; WINDOWS-NEXT: .seh_endprologue +; WINDOWS-NEXT: movq __security_cookie(%rip), %rax +; WINDOWS-NEXT: xorq %rsp, %rax +; WINDOWS-NEXT: movq %rax, {{[0-9]+}}(%rsp) +; WINDOWS-NEXT: callq callee +; WINDOWS-NEXT: callq callee +; WINDOWS-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; WINDOWS-NEXT: xorq %rsp, %rcx +; WINDOWS-NEXT: cmpq __security_cookie(%rip), %rcx +; WINDOWS-NEXT: jne .LBB2_2 +; WINDOWS-NEXT: # %bb.1: +; WINDOWS-NEXT: 
.seh_startepilogue +; WINDOWS-NEXT: addq $40, %rsp +; WINDOWS-NEXT: .seh_endepilogue +; WINDOWS-NEXT: retq +; WINDOWS-NEXT: .LBB2_2: +; WINDOWS-NEXT: callq __security_check_cookie +; WINDOWS-NEXT: int3 +; WINDOWS-NEXT: .seh_endproc +; ; LINUX-LABEL: caller: -; LINUX: callq callee@PLT -; LINUX: callq callee@PLT -; LINUX: cmpq -; LINUX: jne -; LINUX: callq __stack_chk_fail@PLT +; LINUX: # %bb.0: +; LINUX-NEXT: pushq %rax +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: movq %fs:40, %rax +; LINUX-NEXT: movq %rax, (%rsp) +; LINUX-NEXT: callq callee@PLT +; LINUX-NEXT: callq callee@PLT +; LINUX-NEXT: movq %fs:40, %rax +; LINUX-NEXT: cmpq (%rsp), %rax +; LINUX-NEXT: jne .LBB2_2 +; LINUX-NEXT: # %bb.1: # %SP_return +; LINUX-NEXT: popq %rax +; LINUX-NEXT: .cfi_def_cfa_offset 8 +; LINUX-NEXT: retq +; LINUX-NEXT: .LBB2_2: # %CallStackCheckFailBlk +; LINUX-NEXT: .cfi_def_cfa_offset 16 +; LINUX-NEXT: callq __stack_chk_fail@PLT + tail call void @callee() call void @callee() From 3e5d50f9c61bb266ab17919ab5209c7b08520aff Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Thu, 12 Jun 2025 15:20:39 +0530 Subject: [PATCH 205/851] [NVPTX] Add cta_group support to TMA G2S intrinsics (#143178) This patch extends the TMA G2S intrinsics with the support for cta_group::1/2 available from Blackwell onwards. The existing intrinsics are auto-upgraded with a default value of '0' for the `cta_group` flag operand. * lit tests are added for all combinations of the newer variants. * Negative tests are added to validate the error-handling when the value of the cta_group flag falls out-of-range. * The generated PTX is verified with a 12.8 ptxas executable. 
Signed-off-by: Durgadoss R --- llvm/docs/NVPTXUsage.rst | 32 +- llvm/include/llvm/IR/IntrinsicsNVVM.td | 32 +- llvm/include/llvm/IR/NVVMIntrinsicUtils.h | 9 + llvm/lib/IR/AutoUpgrade.cpp | 104 ++++- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 19 + .../NVPTX/MCTargetDesc/NVPTXInstPrinter.h | 1 + llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 19 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 17 +- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 8 + .../Assembler/auto_upgrade_nvvm_intrinsics.ll | 16 +- .../NVPTX/cp-async-bulk-tensor-g2s-1cta.ll | 435 ++++++++++++++++++ .../NVPTX/cp-async-bulk-tensor-g2s-2cta.ll | 435 ++++++++++++++++++ .../NVPTX/cp-async-bulk-tensor-g2s-invalid.ll | 15 + 13 files changed, 1078 insertions(+), 64 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll create mode 100644 llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index d51686c0b830c..abd7ca5453645 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -1016,7 +1016,7 @@ Syntax: .. code-block:: llvm - declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(..., i32 %d0, i32 %d1, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...) @@ -1034,18 +1034,26 @@ source tensor is preserved at the destination. 
The dimension of the tensor data ranges from 1d to 5d with the coordinates specified by the ``i32 %d0 ... i32 %d4`` arguments. -* The last two arguments to these intrinsics are boolean flags - indicating support for cache_hint and/or multicast modifiers. - These flag arguments must be compile-time constants. The backend - looks through these flags and lowers the intrinsics appropriately. +* The last three arguments to these intrinsics are flags + indicating support for multicast, cache_hint and cta_group::1/2 + modifiers. These flag arguments must be compile-time constants. + The backend looks through these flags and lowers the intrinsics + appropriately. -* The Nth argument (denoted by ``i1 flag_ch``) when set, indicates +* The argument denoted by ``i1 %flag_ch`` when set, indicates a valid cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint`` variant of the PTX instruction. -* The [N-1]th argument (denoted by ``i1 flag_mc``) when set, indicates - the presence of a multicast mask (``i16 %mc``) and generates the PTX - instruction with the ``.multicast::cluster`` modifier. +* The argument denoted by ``i1 %flag_mc`` when set, indicates + the presence of a multicast mask (``i16 %mc``) and generates + the PTX instruction with the ``.multicast::cluster`` modifier. + +* The argument denoted by ``i32 %flag_cta_group`` takes values within + the range [0, 3) i.e. {0,1,2}. When the value of ``%flag_cta_group`` + is not within the range, it may raise an error from the Verifier. + The default value is '0' with no cta_group modifier in the + instruction. The values of '1' and '2' lower to ``cta_group::1`` + and ``cta_group::2`` variants of the PTX instruction respectively. For more information, refer PTX ISA ``_. @@ -1058,7 +1066,7 @@ Syntax: .. 
code-block:: llvm - declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch) + declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %dst, ptr addrspace(3) %bar, ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %flag_mc, i1 %flag_ch, i32 %flag_cta_group) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...) declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...) @@ -1074,8 +1082,8 @@ are unrolled into a single dimensional column at the destination. In this mode, the tensor has to be at least three-dimensional. Along with the tensor coordinates, im2col offsets are also specified (denoted by ``i16 im2col0...i16 %im2col2``). The number of im2col offsets is two less -than the number of dimensions of the tensor operation. The last two arguments -to these intrinsics are boolean flags, with the same functionality as described +than the number of dimensions of the tensor operation. The last three arguments +to these intrinsics are flags, with the same functionality as described in the ``tile`` mode intrinsics above. 
For more information, refer PTX ISA diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index 8c8e778b57061..4efdff71c0167 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -2020,20 +2020,26 @@ foreach dim = 1...5 in { defvar num_im2col_offsets = !if(is_im2col, !add(dim, -2), 0); defvar im2col_offsets_args = !listsplat(llvm_i16_ty, num_im2col_offsets); + defvar g2s_params = !listconcat( + [llvm_shared_cluster_ptr_ty, // dst_ptr + llvm_shared_ptr_ty, // mbarrier_ptr + llvm_ptr_ty], // tensormap_ptr + tensor_dim_args, // actual tensor dims + im2col_offsets_args, // im2col offsets + [llvm_i16_ty, // cta_mask + llvm_i64_ty]); // cache_hint + defvar g2s_flags = [llvm_i1_ty, // Flag for cta_mask + llvm_i1_ty, // Flag for cache_hint + llvm_i32_ty]; // Flag for cta_group + defvar cta_group_idx = !add( + !size(g2s_params), + !sub(!size(g2s_flags), 1)); + defvar g2s_props = [IntrConvergent, + WriteOnly>, ReadOnly>, + // Allowed values for cta_group are {0,1,2} i.e [0, 3). 
+ Range, 0, 3>]; def int_nvvm_cp_async_bulk_tensor_g2s_ # mode # _ # dim # d : - DefaultAttrsIntrinsicFlags<[], - !listconcat([llvm_shared_cluster_ptr_ty, // dst_shared_cluster_ptr - llvm_shared_ptr_ty, // mbarrier_smem_ptr - llvm_ptr_ty], // tensormap_ptr - tensor_dim_args, // actual tensor dims - im2col_offsets_args, // im2col offsets - [llvm_i16_ty, // cta_mask - llvm_i64_ty]), // cache_hint - [llvm_i1_ty, // Flag for cta_mask - llvm_i1_ty], // Flag for cache_hint - [IntrConvergent, - WriteOnly>, ReadOnly>, - NoCapture>, NoCapture>, NoCapture>]>; + DefaultAttrsIntrinsicFlags<[], g2s_params, g2s_flags, g2s_props>; def int_nvvm_cp_async_bulk_tensor_s2g_ # mode # _ # dim # d : DefaultAttrsIntrinsicFlags<[], diff --git a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h index ce794e2573637..737610b73b081 100644 --- a/llvm/include/llvm/IR/NVVMIntrinsicUtils.h +++ b/llvm/include/llvm/IR/NVVMIntrinsicUtils.h @@ -38,6 +38,15 @@ enum class TMAReductionOp : uint8_t { XOR = 7, }; +// Enum to represent the cta_group::1 and +// cta_group::2 variants in TMA/TCGEN05 family of +// PTX instructions. +enum class CTAGroupKind : uint8_t { + CG_NONE = 0, // default with no cta_group modifier + CG_1 = 1, // cta_group::1 modifier + CG_2 = 2, // cta_group::2 modifier +}; + inline bool FPToIntegerIntrinsicShouldFTZ(Intrinsic::ID IntrinsicID) { switch (IntrinsicID) { case Intrinsic::nvvm_f2i_rm_ftz: diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index a0886776ff93f..6e7254ec3e31f 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -945,6 +945,53 @@ static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, return false; // No other 'arm.*', 'aarch64.*'. 
} +static Intrinsic::ID shouldUpgradeNVPTXTMAG2SIntrinsics(Function *F, + StringRef Name) { + if (Name.consume_front("cp.async.bulk.tensor.g2s.")) { + Intrinsic::ID ID = + StringSwitch(Name) + .Case("im2col.3d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d) + .Case("im2col.4d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d) + .Case("im2col.5d", + Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d) + .Case("tile.1d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d) + .Case("tile.2d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d) + .Case("tile.3d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d) + .Case("tile.4d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d) + .Case("tile.5d", Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d) + .Default(Intrinsic::not_intrinsic); + + if (ID == Intrinsic::not_intrinsic) + return ID; + + // These intrinsics may need upgrade for two reasons: + // (1) When the address-space of the first argument is shared[AS=3] + // (and we upgrade it to use shared_cluster address-space[AS=7]) + if (F->getArg(0)->getType()->getPointerAddressSpace() == + NVPTXAS::ADDRESS_SPACE_SHARED) + return ID; + + // (2) When there are only two boolean flag arguments at the end: + // + // The last three parameters of the older version of these + // intrinsics are: arg1, arg2, .. i64 ch, i1 mc_flag, i1 ch_flag + // + // The newer version reads as: + // arg1, arg2, .. i64 ch, i1 mc_flag, i1 ch_flag, i32 cta_group_flag + // + // So, when the type of the [N-3]rd argument is "not i1", then + // it is the older version and we need to upgrade. 
+ size_t FlagStartIndex = F->getFunctionType()->getNumParams() - 3; + Type *ArgType = F->getFunctionType()->getParamType(FlagStartIndex); + if (!ArgType->isIntegerTy(1)) + return ID; + } + + return Intrinsic::not_intrinsic; +} + static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F, StringRef Name) { if (Name.consume_front("mapa.shared.cluster")) @@ -959,22 +1006,6 @@ static Intrinsic::ID shouldUpgradeNVPTXSharedClusterIntrinsic(Function *F, Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster) .Case("shared.cta.to.cluster", Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster) - .Case("tensor.g2s.im2col.3d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d) - .Case("tensor.g2s.im2col.4d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d) - .Case("tensor.g2s.im2col.5d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d) - .Case("tensor.g2s.tile.1d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d) - .Case("tensor.g2s.tile.2d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d) - .Case("tensor.g2s.tile.3d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d) - .Case("tensor.g2s.tile.4d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d) - .Case("tensor.g2s.tile.5d", - Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) @@ -1339,6 +1370,14 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, return true; } + // Upgrade TMA copy G2S Intrinsics + IID = shouldUpgradeNVPTXTMAG2SIntrinsics(F, Name); + if (IID != Intrinsic::not_intrinsic) { + rename(F); + NewFn = Intrinsic::getOrInsertDeclaration(F->getParent(), IID); + return true; + } + // The following nvvm intrinsics correspond exactly to an LLVM idiom, but // not to an intrinsic alone. We expand them in UpgradeIntrinsicCall. 
// @@ -4831,7 +4870,18 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { return; } case Intrinsic::nvvm_cp_async_bulk_global_to_shared_cluster: - case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster: + case Intrinsic::nvvm_cp_async_bulk_shared_cta_to_cluster: { + // Create a new call with the correct address space. + SmallVector Args(CI->args()); + Args[0] = Builder.CreateAddrSpaceCast( + Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)); + + NewCall = Builder.CreateCall(NewFn, Args); + NewCall->takeName(CI); + CI->replaceAllUsesWith(NewCall); + CI->eraseFromParent(); + return; + } case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d: @@ -4840,10 +4890,22 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d: case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d: { - // Create a new call with the correct address space. - SmallVector Args(CI->args()); - Args[0] = Builder.CreateAddrSpaceCast( - Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)); + SmallVector Args(CI->args()); + + // Create AddrSpaceCast to shared_cluster if needed. + // This handles case (1) in shouldUpgradeNVPTXTMAG2SIntrinsics(). + unsigned AS = CI->getArgOperand(0)->getType()->getPointerAddressSpace(); + if (AS == NVPTXAS::ADDRESS_SPACE_SHARED) + Args[0] = Builder.CreateAddrSpaceCast( + Args[0], Builder.getPtrTy(NVPTXAS::ADDRESS_SPACE_SHARED_CLUSTER)); + + // Attach the flag argument for cta_group, with a + // default value of 0. This handles case (2) in + // shouldUpgradeNVPTXTMAG2SIntrinsics(). 
+ size_t NumArgs = CI->arg_size(); + Value *FlagArg = CI->getArgOperand(NumArgs - 3); + if (!FlagArg->getType()->isIntegerTy(1)) + Args.push_back(ConstantInt::get(Builder.getInt32Ty(), 0)); NewCall = Builder.CreateCall(NewFn, Args); NewCall->takeName(CI); diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index b4616b64bad15..732950deca9fa 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -437,3 +437,22 @@ void NVPTXInstPrinter::printTmaReductionMode(const MCInst *MI, int OpNum, llvm_unreachable( "Invalid Reduction Op in printCpAsyncBulkTensorReductionMode"); } + +void NVPTXInstPrinter::printCTAGroup(const MCInst *MI, int OpNum, + raw_ostream &O) { + const MCOperand &MO = MI->getOperand(OpNum); + using CGTy = nvvm::CTAGroupKind; + + switch (static_cast(MO.getImm())) { + case CGTy::CG_NONE: + O << ""; + return; + case CGTy::CG_1: + O << ".cta_group::1"; + return; + case CGTy::CG_2: + O << ".cta_group::2"; + return; + } + llvm_unreachable("Invalid cta_group in printCTAGroup"); +} diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h index a2dd772cd86d0..f73af7a3f2c6e 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.h @@ -51,6 +51,7 @@ class NVPTXInstPrinter : public MCInstPrinter { void printProtoIdent(const MCInst *MI, int OpNum, raw_ostream &O); void printPrmtMode(const MCInst *MI, int OpNum, raw_ostream &O); void printTmaReductionMode(const MCInst *MI, int OpNum, raw_ostream &O); + void printCTAGroup(const MCInst *MI, int OpNum, raw_ostream &O); }; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 32223bf3d601e..a20099788d09c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ 
b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -2556,19 +2556,25 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, // We have {Chain, Intrinsic-ID} followed by the actual intrisic args: // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2} // multicast, cache_hint, - // multicast_flag, cache_hint_flag} + // multicast_flag, cache_hint_flag, cta_group_flag} // NumOperands = {Chain, IID} + {Actual intrinsic args} - // = {2} + {7 + dims + im2col_offsets} + // = {2} + {8 + dims + im2col_offsets} size_t NumOps = N->getNumOperands(); size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1)) - : (NumOps - 9); + : (NumOps - 10); // Offsets is always 'NumDims - 2' and only for im2col mode size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0; - bool IsCacheHint = N->getConstantOperandVal(NumOps - 1) == 1; - bool IsMultiCast = N->getConstantOperandVal(NumOps - 2) == 1; + bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1; + bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1; size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src} size_t MultiCastIdx = NumBaseArgs + 2; // for Chain and IID + unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1); + if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()) + report_fatal_error( + formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}", + Subtarget->getSmVersion())); + SDLoc DL(N); SmallVector Ops(N->ops().slice(2, NumBaseArgs)); @@ -2580,6 +2586,9 @@ void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N, if (IsCacheHint) Ops.push_back(N->getOperand(MultiCastIdx + 1)); + // Flag for CTA Group + Ops.push_back(getI32Imm(CTAGroupVal, DL)); + // Finally, the chain operand Ops.push_back(N->getOperand(0)); diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 83d7defe6d9a9..f52ff39c3e1a5 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ 
b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -578,10 +578,14 @@ class G2S_STRINGS { # !if(!eq(mode, "tile"), "_TILE", "_IM2COL"); } +def CTAGroupFlags : Operand { + let PrintMethod = "printCTAGroup"; +} + multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR { defvar dims_dag = !dag(ins, !listsplat(Int32Regs, dim), !foreach(i, !range(dim), "d" # i)); defvar dims_str = !interleave(!foreach(i, !range(dim), "$d" # i), ", "); - defvar asm_str_default = " [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; + defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]"; defvar rc = !if(is_shared32, Int32Regs, Int64Regs); defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0); @@ -595,19 +599,22 @@ multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR !strconcat(asm_str_default, im2col_asm_str), asm_str_default); def "" : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag), + !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)), !strconcat(G2S_STRINGS.inst_name, asm_str, ";"), []>, Requires<[hasPTX<80>, hasSM<90>]>; def _MC : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc)), + !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, + (ins Int16Regs:$mc, CTAGroupFlags:$cg)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; def _CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int64Regs:$ch)), + !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, + (ins Int64Regs:$ch, CTAGroupFlags:$cg)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; def _MC_CH : NVPTXInst<(outs), - !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, (ins Int16Regs:$mc, Int64Regs:$ch)), + !con((ins rc:$dst, rc:$mbar, Int64Regs:$tmap), dims_dag, im2col_dag, + (ins Int16Regs:$mc, Int64Regs:$ch, 
CTAGroupFlags:$cg)), !strconcat(G2S_STRINGS.inst_name, asm_str, ", $mc, $ch;"), []>, Requires<[hasPTX<80>, hasSM<90>]>; } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 5136b1ee28502..d2eae48826829 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -117,6 +117,14 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { return HasTcgen05 && PTXVersion >= 86; } + // TMA G2S copy with cta_group::1/2 support + bool hasCpAsyncBulkTensorCTAGroupSupport() const { + // TODO: Update/tidy-up after the family-conditional support arrives + return ((FullSmVersion == 1001 || FullSmVersion == 1011) && + PTXVersion >= 86) || + (FullSmVersion == 1031 && PTXVersion >= 88); + } + // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction // terminates a basic block. Instead, it would assume that control flow // continued to the next instruction. The next instruction could be in the diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index b7bdca42d5596..a17f11a680aa2 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -307,9 +307,9 @@ define void @nvvm_cp_async_bulk_intrinsics(ptr addrspace(3) %dst, ptr addrspace( ; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_im2col define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) { -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, 
i16 %im2col1, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 false, i1 false, i32 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 0, i64 0, i1 0, i1 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 0, i64 0, i1 0, i1 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 0, i64 0, i1 0, i1 0) @@ -318,11 +318,11 @@ define void @nvvm_cp_async_bulk_tensor_g2s_im2col(ptr addrspace(3) %d, ptr addrs ; CHECK-LABEL: @nvvm_cp_async_bulk_tensor_g2s_tile define void @nvvm_cp_async_bulk_tensor_g2s_tile(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) 
%bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false) -; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %1, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %2, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %3, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %4, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 0, i64 0, i1 false, i1 false, i32 0) +; CHECK: call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %5, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 0, i64 0, i1 false, i1 false, i32 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 0, i64 0, i1 0, i1 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 
%d1, i16 0, i64 0, i1 0, i1 0) call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(3) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 0, i64 0, i1 0, i1 0) diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll new file mode 100644 index 0000000000000..5cfa25dfe55fc --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll @@ -0,0 +1,435 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 
%f3); + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d +define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1}], 
[%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d +define void @test_cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<3>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], 
[%rd1, {%r3, %r4}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d +define void @test_cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6]; 
+; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d +define void @test_cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, 
[test_cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, 
[test_cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, 
i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d +define void @test_cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, 
{%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d +define void @test_cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, 
[test_cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: 
ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 
%d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d +define void @test_cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; 
CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d +define void @test_cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 
%mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<5>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::1 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 1, i32 1) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 0, i32 1) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll 
b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll new file mode 100644 index 0000000000000..a7e6bec6aef10 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll @@ -0,0 +1,435 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %} +; RUN: %if ptxas-12.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %} + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void 
@llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tm, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 %f1, i1 %f2, i32 %f3); + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d +define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<2>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: 
cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_1d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_1d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_1d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_1d_param_5]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.1d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, 
ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d +define void @test_cp_async_bulk_tensor_g2s_tile_2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<3>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], 
[%rd3, {%r1, %r2}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_2d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_2d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_2d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_2d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_2d_param_6]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.2d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr 
addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.2d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d +define void @test_cp_async_bulk_tensor_g2s_tile_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1, 
%rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_3d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], 
%rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d +define void @test_cp_async_bulk_tensor_g2s_tile_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, 
[test_cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, 
[test_cp_async_bulk_tensor_g2s_tile_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d +define void 
@test_cp_async_bulk_tensor_g2s_tile_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rs1; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 
[%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2]; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_tile_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<2>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_tile_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_tile_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_tile_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_tile_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_tile_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_tile_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_tile_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rs1; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], %rd2; +; CHECK-PTX-SHARED32-NEXT: 
cp.async.bulk.tensor.5d.shared::cluster.global.tile.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2]; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d +define void @test_cp_async_bulk_tensor_g2s_im2col_3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX64-NEXT: .reg .b32 %r<4>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, 
[test_cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rs2; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3}], [%rd2], {%rs1}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_3d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<3>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_5]; +; 
CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_3d_param_8]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rs2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.3d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5}], [%r2], {%rs1}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.3d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d +define void 
@test_cp_async_bulk_tensor_g2s_im2col_4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX64-NEXT: .reg .b32 %r<5>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rs3; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, 
{%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4}], [%rd2], {%rs1, %rs2}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: test_cp_async_bulk_tensor_g2s_im2col_4d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<4>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<7>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_4d_param_10]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 
[%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rs3; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.4d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6}], [%r2], {%rs1, %rs2}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.4d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} + +; CHECK-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d +define void @test_cp_async_bulk_tensor_g2s_im2col_5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch) { +; CHECK-PTX64-LABEL: test_cp_async_bulk_tensor_g2s_im2col_5d( +; CHECK-PTX64: { +; CHECK-PTX64-NEXT: .reg .b16 %rs<5>; +; CHECK-PTX64-NEXT: .reg .b32 %r<6>; +; CHECK-PTX64-NEXT: .reg .b64 %rd<5>; +; CHECK-PTX64-EMPTY: +; CHECK-PTX64-NEXT: // %bb.0: +; CHECK-PTX64-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; 
CHECK-PTX64-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX64-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX64-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX64-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX64-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX64-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX64-NEXT: ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX64-NEXT: ld.param.b64 %rd4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rs4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}, %rd4; +; CHECK-PTX64-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%rd1], [%rd3, {%r1, %r2, %r3, %r4, %r5}], [%rd2], {%rs1, %rs2, %rs3}; +; CHECK-PTX64-NEXT: ret; +; +; CHECK-PTX-SHARED32-LABEL: 
test_cp_async_bulk_tensor_g2s_im2col_5d( +; CHECK-PTX-SHARED32: { +; CHECK-PTX-SHARED32-NEXT: .reg .b16 %rs<5>; +; CHECK-PTX-SHARED32-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-SHARED32-NEXT: .reg .b64 %rd<3>; +; CHECK-PTX-SHARED32-EMPTY: +; CHECK-PTX-SHARED32-NEXT: // %bb.0: +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_0]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_1]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_2]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_3]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_4]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r5, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_5]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r6, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_6]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b32 %r7, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_7]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs1, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_8]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_9]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs3, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_10]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b16 %rs4, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_11]; +; CHECK-PTX-SHARED32-NEXT: ld.param.b64 %rd2, [test_cp_async_bulk_tensor_g2s_im2col_5d_param_12]; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.multicast::cluster.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rs4; +; 
CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.L2::cache_hint.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}, %rd2; +; CHECK-PTX-SHARED32-NEXT: cp.async.bulk.tensor.5d.shared::cluster.global.im2col.mbarrier::complete_tx::bytes.cta_group::2 [%r1], [%rd1, {%r3, %r4, %r5, %r6, %r7}], [%r2], {%rs1, %rs2, %rs3}; +; CHECK-PTX-SHARED32-NEXT: ret; + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 1, i1 0, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 1, i32 2) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.im2col.5d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, i16 %mc, i64 %ch, i1 0, i1 0, i32 2) + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll new file mode 100644 index 0000000000000..1c35fbead389e --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-invalid.ll @@ -0,0 +1,15 @@ +; RUN: not llc < %s -mtriple=nvptx64 -mcpu=sm_100a -o /dev/null 2>&1 | FileCheck %s + +target triple = "nvptx64-nvidia-cuda" + +declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) writeonly, ptr addrspace(3), ptr readonly, i32, i16, i64, i1 immarg, i1 immarg, i32 
immarg range(i32 0, 3)) + +define void @test_cp_async_bulk_tensor_g2s_tile_1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch) { + ; CHECK: immarg value 3 out of range [0, 3) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 3) + + ; CHECK: immarg value -1 out of range [0, 3) + tail call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(ptr addrspace(7) %d, ptr addrspace(3) %bar, ptr %tmap, i32 %d0, i16 %mc, i64 %ch, i1 0, i1 0, i32 -1) + + ret void +} From a8c6fb4cb8e686f733e022afc549bc085d1558f4 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 12 Jun 2025 11:53:32 +0200 Subject: [PATCH 206/851] [MemCpyOpt] Fix lifetime marker sizes in tests (NFC) As pointed out in https://github.com/llvm/llvm-project/pull/143782, these tests were specifying the size in bits instead of bytes. In order to preserve the intent of the tests, add a use of %src, which prevents stack-move optimization. These are supposed to test the handling of scoped alias metadata in call slot optimization. 
--- .../test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll | 7 +++++-- llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll | 9 ++++++--- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll index 989049ab67a0b..840a5172561dc 100644 --- a/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll +++ b/llvm/test/Analysis/ScopedNoAliasAA/alias-scope-merging.ll @@ -1,17 +1,20 @@ ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s +declare void @use(ptr) + ; Alias scopes are merged by taking the intersection of domains, then the union of the scopes within those domains define i8 @test(i8 %input) { %tmp = alloca i8 %dst = alloca i8 %src = alloca i8 ; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope ![[SCOPE:[0-9]+]] - call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !4 + call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !4 store i8 %input, ptr %src call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0 - call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !4 + call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !4 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !4 %ret_value = load i8, ptr %dst + call void @use(ptr %src) ret i8 %ret_value } diff --git a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll index efdbdce401b76..601498e36a7a3 100644 --- a/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll +++ b/llvm/test/Transforms/MemCpyOpt/callslot_badaa.ll @@ -1,9 +1,11 @@ ; RUN: opt < %s -S -passes=memcpyopt | FileCheck --match-full-lines %s +declare void @use(ptr) + ; Make sure callslot optimization merges alias.scope metadata correctly when it merges 
instructions. ; Merging here naively generates: ; call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false), !alias.scope !3 -; call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !0 +; call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !0 ; ... ; !0 = !{!1} ; !1 = distinct !{!1, !2, !"callee1: %a"} @@ -18,12 +20,13 @@ define i8 @test(i8 %input) { %src = alloca i8 ; NOTE: we're matching the full line and looking for the lack of !alias.scope here ; CHECK: call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %src, i64 1, i1 false) - call void @llvm.lifetime.start.p0(i64 8, ptr nonnull %src), !noalias !3 + call void @llvm.lifetime.start.p0(i64 1, ptr nonnull %src), !noalias !3 store i8 %input, ptr %src call void @llvm.memcpy.p0.p0.i64(ptr align 8 %tmp, ptr align 8 %src, i64 1, i1 false), !alias.scope !0 - call void @llvm.lifetime.end.p0(i64 8, ptr nonnull %src), !noalias !3 + call void @llvm.lifetime.end.p0(i64 1, ptr nonnull %src), !noalias !3 call void @llvm.memcpy.p0.p0.i64(ptr align 8 %dst, ptr align 8 %tmp, i64 1, i1 false), !alias.scope !3 %ret_value = load i8, ptr %dst + call void @use(ptr %src) ret i8 %ret_value } From 5987f1ee5cc59a05961156c04010ab0f3c857628 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Thu, 12 Jun 2025 11:52:28 +0200 Subject: [PATCH 207/851] [InstCombine] Regenerate `narrow-switch.ll` test (NFC) `narrow-switch.ll` test has been regenerated via latest UTC using `--prefix-filecheck-ir-name _`, so as to avoid conflicts with scripted variable names. 
--- .../Transforms/InstCombine/narrow-switch.ll | 194 +++++++++++++----- 1 file changed, 148 insertions(+), 46 deletions(-) diff --git a/llvm/test/Transforms/InstCombine/narrow-switch.ll b/llvm/test/Transforms/InstCombine/narrow-switch.ll index 05a30b910e5ee..90f56a61fa410 100644 --- a/llvm/test/Transforms/InstCombine/narrow-switch.ll +++ b/llvm/test/Transforms/InstCombine/narrow-switch.ll @@ -1,15 +1,27 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --prefix-filecheck-ir-name _ --version 5 ; Vary legal integer types in data layout. ; RUN: opt < %s -passes=instcombine -S -data-layout=n32 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK32 ; RUN: opt < %s -passes=instcombine -S -data-layout=n32:64 | FileCheck %s --check-prefix=ALL --check-prefix=CHECK64 define i32 @positive1(i64 %a) { -; ALL-LABEL: @positive1( -; ALL: switch i32 -; ALL-NEXT: i32 10, label %return -; ALL-NEXT: i32 100, label %sw.bb1 -; ALL-NEXT: i32 1001, label %sw.bb2 +; ALL-LABEL: define i32 @positive1( +; ALL-SAME: i64 [[A:%.*]]) { +; ALL-NEXT: [[ENTRY:.*]]: +; ALL-NEXT: [[TRUNC:%.*]] = trunc i64 [[A]] to i32 +; ALL-NEXT: switch i32 [[TRUNC]], label %[[SW_DEFAULT:.*]] [ +; ALL-NEXT: i32 10, label %[[RETURN:.*]] +; ALL-NEXT: i32 100, label %[[SW_BB1:.*]] +; ALL-NEXT: i32 1001, label %[[SW_BB2:.*]] ; ALL-NEXT: ] +; ALL: [[SW_BB1]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_BB2]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_DEFAULT]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[RETURN]]: +; ALL-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ] +; ALL-NEXT: ret i32 [[RETVAL_0]] ; entry: %and = and i64 %a, 4294967295 @@ -34,12 +46,24 @@ return: } define i32 @negative1(i64 %a) { -; ALL-LABEL: @negative1( -; ALL: switch i32 -; ALL-NEXT: i32 -10, label %return -; ALL-NEXT: i32 -100, label %sw.bb1 -; 
ALL-NEXT: i32 -1001, label %sw.bb2 +; ALL-LABEL: define i32 @negative1( +; ALL-SAME: i64 [[A:%.*]]) { +; ALL-NEXT: [[ENTRY:.*]]: +; ALL-NEXT: [[TRUNC:%.*]] = trunc i64 [[A]] to i32 +; ALL-NEXT: switch i32 [[TRUNC]], label %[[SW_DEFAULT:.*]] [ +; ALL-NEXT: i32 -10, label %[[RETURN:.*]] +; ALL-NEXT: i32 -100, label %[[SW_BB1:.*]] +; ALL-NEXT: i32 -1001, label %[[SW_BB2:.*]] ; ALL-NEXT: ] +; ALL: [[SW_BB1]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_BB2]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_DEFAULT]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[RETURN]]: +; ALL-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ] +; ALL-NEXT: ret i32 [[RETVAL_0]] ; entry: %or = or i64 %a, -4294967296 @@ -67,12 +91,24 @@ return: ; assertion. define i32 @trunc72to68(i72 %a) { -; ALL-LABEL: @trunc72to68( -; ALL: switch i68 -; ALL-NEXT: i68 10, label %return -; ALL-NEXT: i68 100, label %sw.bb1 -; ALL-NEXT: i68 1001, label %sw.bb2 +; ALL-LABEL: define i32 @trunc72to68( +; ALL-SAME: i72 [[A:%.*]]) { +; ALL-NEXT: [[ENTRY:.*]]: +; ALL-NEXT: [[TRUNC:%.*]] = trunc i72 [[A]] to i68 +; ALL-NEXT: switch i68 [[TRUNC]], label %[[SW_DEFAULT:.*]] [ +; ALL-NEXT: i68 10, label %[[RETURN:.*]] +; ALL-NEXT: i68 100, label %[[SW_BB1:.*]] +; ALL-NEXT: i68 1001, label %[[SW_BB2:.*]] ; ALL-NEXT: ] +; ALL: [[SW_BB1]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_BB2]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_DEFAULT]]: +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[RETURN]]: +; ALL-NEXT: [[RETVAL_0:%.*]] = phi i32 [ 24, %[[SW_DEFAULT]] ], [ 123, %[[SW_BB2]] ], [ 213, %[[SW_BB1]] ], [ 231, %[[ENTRY]] ] +; ALL-NEXT: ret i32 [[RETVAL_0]] ; entry: %and = and i72 %a, 295147905179352825855 @@ -103,15 +139,38 @@ return: ; because both are illegal. 
define void @trunc64to58(i64 %a) { -; ALL-LABEL: @trunc64to58( -; CHECK32: switch i58 -; CHECK32-NEXT: i58 0, label %sw.bb1 -; CHECK32-NEXT: i58 18717182647723699, label %sw.bb2 +; CHECK32-LABEL: define void @trunc64to58( +; CHECK32-SAME: i64 [[A:%.*]]) { +; CHECK32-NEXT: [[ENTRY:.*:]] +; CHECK32-NEXT: [[TMP0:%.*]] = trunc i64 [[A]] to i58 +; CHECK32-NEXT: [[TMP1:%.*]] = and i58 [[TMP0]], 15 +; CHECK32-NEXT: [[TRUNC:%.*]] = mul nuw i58 [[TMP1]], 18717182647723699 +; CHECK32-NEXT: switch i58 [[TRUNC]], label %[[SW_DEFAULT:.*]] [ +; CHECK32-NEXT: i58 0, label %[[SW_BB1:.*]] +; CHECK32-NEXT: i58 18717182647723699, label %[[SW_BB2:.*]] ; CHECK32-NEXT: ] -; CHECK64: switch i64 -; CHECK64-NEXT: i64 0, label %sw.bb1 -; CHECK64-NEXT: i64 18717182647723699, label %sw.bb2 +; CHECK32: [[SW_BB1]]: +; CHECK32-NEXT: br label %[[SW_DEFAULT]] +; CHECK32: [[SW_BB2]]: +; CHECK32-NEXT: br label %[[SW_DEFAULT]] +; CHECK32: [[SW_DEFAULT]]: +; CHECK32-NEXT: ret void +; +; CHECK64-LABEL: define void @trunc64to58( +; CHECK64-SAME: i64 [[A:%.*]]) { +; CHECK64-NEXT: [[ENTRY:.*:]] +; CHECK64-NEXT: [[_TMP0:%.*]] = and i64 [[A]], 15 +; CHECK64-NEXT: [[TMP0:%.*]] = mul nuw nsw i64 [[_TMP0]], 18717182647723699 +; CHECK64-NEXT: switch i64 [[TMP0]], label %[[SW_DEFAULT:.*]] [ +; CHECK64-NEXT: i64 0, label %[[SW_BB1:.*]] +; CHECK64-NEXT: i64 18717182647723699, label %[[SW_BB2:.*]] ; CHECK64-NEXT: ] +; CHECK64: [[SW_BB1]]: +; CHECK64-NEXT: br label %[[SW_DEFAULT]] +; CHECK64: [[SW_BB2]]: +; CHECK64-NEXT: br label %[[SW_DEFAULT]] +; CHECK64: [[SW_DEFAULT]]: +; CHECK64-NEXT: ret void ; entry: %tmp0 = and i64 %a, 15 @@ -136,18 +195,19 @@ sw.default: ; https://llvm.org/bugs/show_bug.cgi?id=31260 define i8 @PR31260(i8 %x) { -; ALL-LABEL: @PR31260( -; ALL-NEXT: entry: -; ALL-NEXT: [[T4:%.*]] = and i8 [[X:%.*]], 2 -; ALL-NEXT: switch i8 [[T4]], label [[EXIT:%.*]] [ -; ALL-NEXT: i8 0, label [[CASE126:%.*]] -; ALL-NEXT: i8 2, label [[CASE124:%.*]] +; ALL-LABEL: define i8 @PR31260( +; ALL-SAME: i8 [[X:%.*]]) 
{ +; ALL-NEXT: [[ENTRY:.*:]] +; ALL-NEXT: [[T4:%.*]] = and i8 [[X]], 2 +; ALL-NEXT: switch i8 [[T4]], label %[[EXIT:.*]] [ +; ALL-NEXT: i8 0, label %[[CASE126:.*]] +; ALL-NEXT: i8 2, label %[[CASE124:.*]] ; ALL-NEXT: ] -; ALL: exit: +; ALL: [[EXIT]]: ; ALL-NEXT: ret i8 1 -; ALL: case126: +; ALL: [[CASE126]]: ; ALL-NEXT: ret i8 3 -; ALL: case124: +; ALL: [[CASE124]]: ; ALL-NEXT: ret i8 5 ; entry: @@ -169,12 +229,33 @@ case124: ; Make sure the arithmetic evaluation of the switch ; condition is evaluated on the original type define i32 @trunc32to16(i32 %a0) #0 { -; ALL-LABEL: @trunc32to16( -; ALL: switch i16 -; ALL-NEXT: i16 63, label %sw.bb -; ALL-NEXT: i16 1, label %sw.bb1 -; ALL-NEXT: i16 100, label %sw.bb2 +; ALL-LABEL: define i32 @trunc32to16( +; ALL-SAME: i32 [[A0:%.*]]) { +; ALL-NEXT: [[ENTRY:.*:]] +; ALL-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 +; ALL-NEXT: [[XOR:%.*]] = lshr i32 [[A0]], 16 +; ALL-NEXT: [[TMP0:%.*]] = trunc nuw i32 [[XOR]] to i16 +; ALL-NEXT: [[TRUNC:%.*]] = xor i16 [[TMP0]], 15784 +; ALL-NEXT: switch i16 [[TRUNC]], label %[[SW_EPILOG:.*]] [ +; ALL-NEXT: i16 63, label %[[SW_BB:.*]] +; ALL-NEXT: i16 1, label %[[SW_BB1:.*]] +; ALL-NEXT: i16 100, label %[[SW_BB2:.*]] ; ALL-NEXT: ] +; ALL: [[SW_BB]]: +; ALL-NEXT: store i32 90, ptr [[RETVAL]], align 4 +; ALL-NEXT: br label %[[RETURN:.*]] +; ALL: [[SW_BB1]]: +; ALL-NEXT: store i32 91, ptr [[RETVAL]], align 4 +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_BB2]]: +; ALL-NEXT: store i32 92, ptr [[RETVAL]], align 4 +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[SW_EPILOG]]: +; ALL-NEXT: store i32 113, ptr [[RETVAL]], align 4 +; ALL-NEXT: br label %[[RETURN]] +; ALL: [[RETURN]]: +; ALL-NEXT: [[RVAL:%.*]] = load i32, ptr [[RETVAL]], align 4 +; ALL-NEXT: ret i32 [[RVAL]] ; entry: %retval = alloca i32, align 4 @@ -182,9 +263,9 @@ entry: %shr = lshr i32 %xor, 16 %add = add i32 %shr, -917677090 switch i32 %add, label %sw.epilog [ - i32 -917677027, label %sw.bb - i32 -917677089, label %sw.bb1 - i32 -917676990, 
label %sw.bb2 + i32 -917677027, label %sw.bb + i32 -917677089, label %sw.bb1 + i32 -917676990, label %sw.bb2 ] sw.bb: ; preds = %entry @@ -219,11 +300,32 @@ declare i32 @goo() ; if original type is legal (i32 in this case) define void @PR29009() { -; ALL-LABEL: @PR29009( -; ALL: switch i32 -; ALL-NEXT: i32 0, label -; ALL-NEXT: i32 3, label +; ALL-LABEL: define void @PR29009() { +; ALL-NEXT: br label %[[BB1:.*]] +; ALL: [[BB1]]: +; ALL-NEXT: [[TMP2:%.*]] = load volatile i32, ptr @njob, align 4 +; ALL-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP2]], 0 +; ALL-NEXT: br i1 [[DOTNOT]], label %[[BB10:.*]], label %[[BB3:.*]] +; ALL: [[BB3]]: +; ALL-NEXT: [[TMP4:%.*]] = call i32 @goo() +; ALL-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 7 +; ALL-NEXT: switch i32 [[TMP5]], label %[[BB6:.*]] [ +; ALL-NEXT: i32 0, label %[[BB7:.*]] +; ALL-NEXT: i32 3, label %[[BB8:.*]] ; ALL-NEXT: ] +; ALL: [[BB6]]: +; ALL-NEXT: store i32 6, ptr @a, align 4 +; ALL-NEXT: br label %[[BB9:.*]] +; ALL: [[BB7]]: +; ALL-NEXT: store i32 1, ptr @a, align 4 +; ALL-NEXT: br label %[[BB9]] +; ALL: [[BB8]]: +; ALL-NEXT: store i32 2, ptr @a, align 4 +; ALL-NEXT: br label %[[BB9]] +; ALL: [[BB9]]: +; ALL-NEXT: br label %[[BB1]] +; ALL: [[BB10]]: +; ALL-NEXT: ret void ; br label %1 @@ -236,8 +338,8 @@ define void @PR29009() { %5 = call i32 @goo() %6 = and i32 %5, 7 switch i32 %6, label %7 [ - i32 0, label %8 - i32 3, label %9 + i32 0, label %8 + i32 3, label %9 ] ; \n" + "#include_next \\ \n" + "\n" + "#__include_macros\\ \n" + "\n" + "#import \\ \t\n" + "\n" + "@import \\\t \n" + "A;\n" + "#pragma clang \\ \n" + "module \\ \n" + "import A\n" + "#pragma \\ \n" + "push_macro(A)\n" + "#pragma \\\t \n" + "pop_macro(A)\n" + "#pragma \\ \n" + "include_alias(,\\ \n" + ")\n" + "export \\ \n" + "module m;\n" + "import\t\\\t \n" + "m;\n" + "#pragma\t\\ \n" + "clang\t\\ \t\n" + "system_header\n"; + ASSERT_FALSE( + minimizeSourceToDependencyDirectives(Input, Out, Tokens, Directives)); + + EXPECT_EQ(pp_define, 
Directives[0].Kind); + EXPECT_EQ(pp_undef, Directives[1].Kind); + EXPECT_EQ(pp_endif, Directives[2].Kind); + EXPECT_EQ(pp_if, Directives[3].Kind); + EXPECT_EQ(pp_ifdef, Directives[4].Kind); + EXPECT_EQ(pp_ifndef, Directives[5].Kind); + EXPECT_EQ(pp_elifdef, Directives[6].Kind); + EXPECT_EQ(pp_elifndef, Directives[7].Kind); + EXPECT_EQ(pp_elif, Directives[8].Kind); + EXPECT_EQ(pp_else, Directives[9].Kind); + EXPECT_EQ(pp_include, Directives[10].Kind); + EXPECT_EQ(pp_include_next, Directives[11].Kind); + EXPECT_EQ(pp___include_macros, Directives[12].Kind); + EXPECT_EQ(pp_import, Directives[13].Kind); + EXPECT_EQ(decl_at_import, Directives[14].Kind); + EXPECT_EQ(pp_pragma_import, Directives[15].Kind); + EXPECT_EQ(pp_pragma_push_macro, Directives[16].Kind); + EXPECT_EQ(pp_pragma_pop_macro, Directives[17].Kind); + EXPECT_EQ(pp_pragma_include_alias, Directives[18].Kind); + EXPECT_EQ(cxx_export_module_decl, Directives[19].Kind); + EXPECT_EQ(cxx_import_decl, Directives[20].Kind); + EXPECT_EQ(pp_pragma_system_header, Directives[21].Kind); + EXPECT_EQ(pp_eof, Directives[22].Kind); +} + TEST(MinimizeSourceToDependencyDirectivesTest, PoundWarningAndError) { SmallVector Out; From 92a116c4ef822950f8c57eaa5164c844c73a1f7e Mon Sep 17 00:00:00 2001 From: Alexey Samsonov Date: Fri, 13 Jun 2025 10:48:34 -0700 Subject: [PATCH 416/851] Revert "Fix/reapply "[libc] Migrate stdio tests to ErrnoCheckingTest."" (#144129) Reverts llvm/llvm-project#143972 - matcher seems to be pedantic for fgets tests, reverting to verify and fix. 
--- libc/test/src/stdio/CMakeLists.txt | 13 ------------ libc/test/src/stdio/fdopen_test.cpp | 14 +++++++------ libc/test/src/stdio/fgetc_test.cpp | 22 +++++++++----------- libc/test/src/stdio/fgetc_unlocked_test.cpp | 22 +++++++++----------- libc/test/src/stdio/fgets_test.cpp | 18 +++++++--------- libc/test/src/stdio/fileop_test.cpp | 20 +++++++++++++----- libc/test/src/stdio/fopencookie_test.cpp | 15 ++++++------- libc/test/src/stdio/remove_test.cpp | 10 ++++----- libc/test/src/stdio/rename_test.cpp | 9 ++++---- libc/test/src/stdio/setvbuf_test.cpp | 9 ++++---- libc/test/src/stdio/unlocked_fileop_test.cpp | 7 ++++--- libc/test/src/stdlib/StrtolTest.h | 1 + libc/test/src/stdlib/strtold_test.cpp | 1 + 13 files changed, 77 insertions(+), 84 deletions(-) diff --git a/libc/test/src/stdio/CMakeLists.txt b/libc/test/src/stdio/CMakeLists.txt index 4aa8b95880018..ce2171f19597b 100644 --- a/libc/test/src/stdio/CMakeLists.txt +++ b/libc/test/src/stdio/CMakeLists.txt @@ -20,7 +20,6 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -69,7 +68,6 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fwrite libc.src.stdio.setvbuf - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -90,7 +88,6 @@ add_libc_test( libc.src.stdio.fread_unlocked libc.src.stdio.funlockfile libc.src.stdio.fwrite_unlocked - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -112,7 +109,6 @@ add_libc_test( libc.src.stdio.fread libc.src.stdio.fseek libc.src.stdio.fwrite - libc.test.UnitTest.ErrnoCheckingTest LINK_LIBRARIES LibcMemoryHelpers ) @@ -442,7 +438,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.sys.stat.mkdirat libc.src.unistd.access libc.src.unistd.close - libc.test.UnitTest.ErrnoCheckingTest ) add_libc_test( @@ -457,7 +452,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.rename libc.src.unistd.access libc.src.unistd.close - libc.test.UnitTest.ErrnoCheckingTest 
libc.test.UnitTest.ErrnoSetterMatcher ) @@ -474,7 +468,6 @@ if(${LIBC_TARGET_OS} STREQUAL "linux") libc.src.stdio.fgets libc.src.stdio.fputs libc.src.unistd.close - libc.test.UnitTest.ErrnoCheckingTest libc.test.UnitTest.ErrnoSetterMatcher ) endif() @@ -495,8 +488,6 @@ add_libc_test( libc.src.stdio.fopen libc.src.stdio.fwrite libc.src.stdio.getc - libc.test.UnitTest.ErrnoCheckingTest - libc.test.UnitTest.ErrnoSetterMatcher ) add_libc_test( @@ -519,8 +510,6 @@ add_libc_test( libc.src.stdio.funlockfile libc.src.stdio.fwrite libc.src.stdio.getc_unlocked - libc.test.UnitTest.ErrnoCheckingTest - libc.test.UnitTest.ErrnoSetterMatcher ) add_libc_test( @@ -538,8 +527,6 @@ add_libc_test( libc.src.stdio.fgets libc.src.stdio.fopen libc.src.stdio.fwrite - libc.test.UnitTest.ErrnoCheckingTest - libc.test.UnitTest.ErrnoSetterMatcher ) add_libc_test( diff --git a/libc/test/src/stdio/fdopen_test.cpp b/libc/test/src/stdio/fdopen_test.cpp index b53184c30be36..104fc478b100e 100644 --- a/libc/test/src/stdio/fdopen_test.cpp +++ b/libc/test/src/stdio/fdopen_test.cpp @@ -9,21 +9,20 @@ #include "src/stdio/fdopen.h" #include "hdr/fcntl_macros.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/fclose.h" #include "src/stdio/fgets.h" #include "src/stdio/fputs.h" #include "src/unistd/close.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include // For S_IRWXU -using LlvmLibcStdioFdopenTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcStdioFdopenTest, WriteAppendRead) { +TEST(LlvmLibcStdioFdopenTest, WriteAppendRead) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/write_read_append.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC | O_RDWR, S_IRWXU); @@ -53,7 +52,8 @@ TEST_F(LlvmLibcStdioFdopenTest, 
WriteAppendRead) { ASSERT_ERRNO_SUCCESS(); } -TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { +TEST(LlvmLibcStdioFdopenTest, InvalidFd) { + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/invalid_fd.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_TRUNC); @@ -64,7 +64,8 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidFd) { ASSERT_TRUE(nullptr == fp); } -TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { +TEST(LlvmLibcStdioFdopenTest, InvalidMode) { + libc_errno = 0; constexpr const char *TEST_FILE_NAME = "testdata/invalid_mode.test"; auto TEST_FILE = libc_make_test_file_path(TEST_FILE_NAME); int fd = LIBC_NAMESPACE::open(TEST_FILE, O_CREAT | O_RDONLY, S_IRWXU); @@ -82,6 +83,7 @@ TEST_F(LlvmLibcStdioFdopenTest, InvalidMode) { auto *fp2 = LIBC_NAMESPACE::fdopen(fd, "w"); ASSERT_ERRNO_EQ(EINVAL); ASSERT_TRUE(nullptr == fp2); + libc_errno = 0; LIBC_NAMESPACE::close(fd); ASSERT_ERRNO_SUCCESS(); } diff --git a/libc/test/src/stdio/fgetc_test.cpp b/libc/test/src/stdio/fgetc_test.cpp index be2e50271b510..56bde5f0099a8 100644 --- a/libc/test/src/stdio/fgetc_test.cpp +++ b/libc/test/src/stdio/fgetc_test.cpp @@ -14,15 +14,12 @@ #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; - -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -30,28 +27,29 @@ class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { ASSERT_FALSE(file == nullptr); constexpr char CONTENT[] = "123456789"; constexpr size_t WRITE_SIZE = 
sizeof(CONTENT) - 1; - ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file), - Succeeds(WRITE_SIZE)); + ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file)); // This is a write-only file so reads should fail. - ASSERT_THAT(func(file), Fails(EBADF, EOF)); + ASSERT_EQ(func(file), EOF); // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; - ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); + ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); file = LIBC_NAMESPACE::fopen(filename, "r"); ASSERT_FALSE(file == nullptr); for (size_t i = 0; i < WRITE_SIZE; ++i) { - ASSERT_THAT(func(file), Succeeds(int('1' + i))); + int c = func(file); + ASSERT_EQ(c, int('1' + i)); } // Reading more should return EOF but not set error. - ASSERT_THAT(func(file), Succeeds(EOF)); + ASSERT_EQ(func(file), EOF); ASSERT_NE(LIBC_NAMESPACE::feof(file), 0); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); - ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); + ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); } }; diff --git a/libc/test/src/stdio/fgetc_unlocked_test.cpp b/libc/test/src/stdio/fgetc_unlocked_test.cpp index bef9dafd3d87c..90429ecf4e82b 100644 --- a/libc/test/src/stdio/fgetc_unlocked_test.cpp +++ b/libc/test/src/stdio/fgetc_unlocked_test.cpp @@ -17,15 +17,12 @@ #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite.h" #include "src/stdio/getc_unlocked.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; - -class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { +class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::Test { public: using GetcFunc = int(FILE *); void test_with_func(GetcFunc *func, const char *filename) { @@ -33,30 +30,31 @@ 
class LlvmLibcGetcTest : public LIBC_NAMESPACE::testing::ErrnoCheckingTest { ASSERT_FALSE(file == nullptr); constexpr char CONTENT[] = "123456789"; constexpr size_t WRITE_SIZE = sizeof(CONTENT) - 1; - ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file), - Succeeds(WRITE_SIZE)); + ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file)); // This is a write-only file so reads should fail. - ASSERT_THAT(func(file), Fails(EBADF, EOF)); + ASSERT_EQ(func(file), EOF); // This is an error and not a real EOF. ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; - ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); + ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); file = LIBC_NAMESPACE::fopen(filename, "r"); ASSERT_FALSE(file == nullptr); LIBC_NAMESPACE::flockfile(file); for (size_t i = 0; i < WRITE_SIZE; ++i) { - ASSERT_THAT(func(file), Succeeds(int('1' + i))); + int c = func(file); + ASSERT_EQ(c, int('1' + i)); } // Reading more should return EOF but not set error. 
- ASSERT_THAT(func(file), Succeeds(EOF)); + ASSERT_EQ(func(file), EOF); ASSERT_NE(LIBC_NAMESPACE::feof_unlocked(file), 0); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(file), 0); LIBC_NAMESPACE::funlockfile(file); - ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); + ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); } }; diff --git a/libc/test/src/stdio/fgets_test.cpp b/libc/test/src/stdio/fgets_test.cpp index ca8d4d4546635..abed3d4052939 100644 --- a/libc/test/src/stdio/fgets_test.cpp +++ b/libc/test/src/stdio/fgets_test.cpp @@ -12,14 +12,11 @@ #include "src/stdio/fgets.h" #include "src/stdio/fopen.h" #include "src/stdio/fwrite.h" -#include "test/UnitTest/ErrnoCheckingTest.h" -#include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -using LlvmLibcFgetsTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; -using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher; +#include "src/__support/libc_errno.h" -TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { +TEST(LlvmLibcFgetsTest, WriteAndReadCharacters) { constexpr char FILENAME[] = "testdata/fgets.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -32,15 +29,15 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { char buff[8]; char *output; - ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file), - Succeeds(WRITE_SIZE)); + ASSERT_EQ(WRITE_SIZE, LIBC_NAMESPACE::fwrite(CONTENT, 1, WRITE_SIZE, file)); // This is a write-only file so reads should fail. - ASSERT_THAT(LIBC_NAMESPACE::fgets(buff, 8, file), Fails(EBADF, nullptr)); + ASSERT_TRUE(LIBC_NAMESPACE::fgets(buff, 8, file) == nullptr); // This is an error and not a real EOF. 
ASSERT_EQ(LIBC_NAMESPACE::feof(file), 0); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; - ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); + ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); file = LIBC_NAMESPACE::fopen(FILENAME, "r"); ASSERT_FALSE(file == nullptr); @@ -58,7 +55,6 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { // This is also implementation defined. output = LIBC_NAMESPACE::fgets(buff, 0, file); ASSERT_TRUE(output == nullptr); - ASSERT_ERRNO_SUCCESS(); #endif const char *output_arr[] = { @@ -90,5 +86,5 @@ TEST_F(LlvmLibcFgetsTest, WriteAndReadCharacters) { ASSERT_NE(LIBC_NAMESPACE::feof(file), 0); ASSERT_ERRNO_SUCCESS(); - ASSERT_THAT(LIBC_NAMESPACE::fclose(file), Succeeds()); + ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); } diff --git a/libc/test/src/stdio/fileop_test.cpp b/libc/test/src/stdio/fileop_test.cpp index e097785832d56..e624181c795b8 100644 --- a/libc/test/src/stdio/fileop_test.cpp +++ b/libc/test/src/stdio/fileop_test.cpp @@ -17,18 +17,17 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::EQ; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::NE; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::returns; -TEST_F(LlvmLibcFILETest, SimpleFileOperations) { +TEST(LlvmLibcFILETest, SimpleFileOperations) { constexpr char FILENAME[] = "testdata/simple_operations.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); @@ -42,6 +41,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fread(read_data, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); 
+ libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); @@ -72,6 +72,7 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), file), returns(EQ(size_t(0))).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); @@ -79,12 +80,15 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_THAT(LIBC_NAMESPACE::fputs(CONTENT, file), returns(EQ(EOF)).with_errno(NE(0))); ASSERT_NE(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; LIBC_NAMESPACE::clearerr(file); ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fwrite("nothing", 1, 1, file), returns(EQ(size_t(0))).with_errno(NE(0))); + libc_errno = 0; ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); @@ -99,8 +103,10 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { ASSERT_EQ(LIBC_NAMESPACE::ferror(file), 0); // This is not a readable file. + libc_errno = 0; ASSERT_THAT(LIBC_NAMESPACE::fread(data, 1, 1, file), returns(EQ(0)).with_errno(NE(0))); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(file)); @@ -115,18 +121,21 @@ TEST_F(LlvmLibcFILETest, SimpleFileOperations) { // Check that the other functions correctly set libc_errno. 
+ // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fseek(file, 0, SEEK_SET), 0); // ASSERT_ERRNO_FAILURE(); + // libc_errno = 0; // ASSERT_NE(LIBC_NAMESPACE::fclose(file), 0); // ASSERT_ERRNO_FAILURE(); + // libc_errno = 0; // ASSERT_EQ(LIBC_NAMESPACE::fopen("INVALID FILE NAME", "r"), // static_cast(nullptr)); // ASSERT_ERRNO_FAILURE(); } -TEST_F(LlvmLibcFILETest, FFlush) { +TEST(LlvmLibcFILETest, FFlush) { constexpr char FILENAME[] = "testdata/fflush.test"; ::FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w+"); ASSERT_FALSE(file == nullptr); @@ -147,7 +156,7 @@ TEST_F(LlvmLibcFILETest, FFlush) { ASSERT_EQ(LIBC_NAMESPACE::fclose(file), 0); } -TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { +TEST(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { using MyStruct = struct { char c; unsigned long long i; @@ -156,6 +165,7 @@ TEST_F(LlvmLibcFILETest, FOpenFWriteSizeGreaterThanOne) { constexpr size_t WRITE_NMEMB = sizeof(WRITE_DATA) / sizeof(MyStruct); constexpr char FILENAME[] = "testdata/fread_fwrite.test"; + libc_errno = 0; FILE *file = LIBC_NAMESPACE::fopen(FILENAME, "w"); ASSERT_FALSE(file == nullptr); ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(WRITE_DATA, 0, 1, file)); diff --git a/libc/test/src/stdio/fopencookie_test.cpp b/libc/test/src/stdio/fopencookie_test.cpp index bcf5e674141a7..03e1ac286b646 100644 --- a/libc/test/src/stdio/fopencookie_test.cpp +++ b/libc/test/src/stdio/fopencookie_test.cpp @@ -15,7 +15,6 @@ #include "src/stdio/fread.h" #include "src/stdio/fseek.h" #include "src/stdio/fwrite.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/MemoryMatcher.h" #include "test/UnitTest/Test.h" @@ -23,7 +22,6 @@ #include "hdr/types/size_t.h" #include "src/__support/libc_errno.h" -using LlvmLibcFOpenCookieTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; using MemoryView = LIBC_NAMESPACE::testing::MemoryView; struct StringStream { @@ -90,7 +88,7 @@ int close_ss(void *cookie) { constexpr cookie_io_functions_t STRING_STREAM_FUNCS = 
{&read_ss, &write_ss, &seek_ss, &close_ss}; -TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { +TEST(LlvmLibcFOpenCookie, ReadOnlyCookieTest) { constexpr char CONTENT[] = "Hello,readonly!"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(CONTENT))); @@ -117,6 +115,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { ASSERT_EQ(size_t(0), LIBC_NAMESPACE::fwrite(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -125,7 +124,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadOnlyCookieTest) { free(ss); } -TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { +TEST(LlvmLibcFOpenCookie, WriteOnlyCookieTest) { size_t INIT_BUFSIZE = 32; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(INIT_BUFSIZE)); @@ -150,6 +149,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { LIBC_NAMESPACE::fread(read_data, 1, sizeof(WRITE_DATA), f)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_EQ(EBADF); + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -158,7 +158,7 @@ TEST_F(LlvmLibcFOpenCookieTest, WriteOnlyCookieTest) { free(ss); } -TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { +TEST(LlvmLibcFOpenCookie, AppendOnlyCookieTest) { constexpr char INITIAL_CONTENT[] = "1234567890987654321"; constexpr char WRITE_DATA[] = "append"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); @@ -178,6 +178,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { ASSERT_EQ(LIBC_NAMESPACE::fread(read_data, 1, READ_SIZE, f), size_t(0)); ASSERT_NE(LIBC_NAMESPACE::ferror(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr(f); ASSERT_EQ(LIBC_NAMESPACE::ferror(f), 0); @@ -191,7 +192,7 @@ TEST_F(LlvmLibcFOpenCookieTest, AppendOnlyCookieTest) { free(ss); } 
-TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) { +TEST(LlvmLibcFOpenCookie, ReadUpdateCookieTest) { const char INITIAL_CONTENT[] = "1234567890987654321"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(INITIAL_CONTENT))); @@ -222,7 +223,7 @@ TEST_F(LlvmLibcFOpenCookieTest, ReadUpdateCookieTest) { free(ss); } -TEST_F(LlvmLibcFOpenCookieTest, WriteUpdateCookieTest) { +TEST(LlvmLibcFOpenCookie, WriteUpdateCookieTest) { constexpr char WRITE_DATA[] = "hello, file"; auto *ss = reinterpret_cast(malloc(sizeof(StringStream))); ss->buf = reinterpret_cast(malloc(sizeof(WRITE_DATA))); diff --git a/libc/test/src/stdio/remove_test.cpp b/libc/test/src/stdio/remove_test.cpp index 296bff1f5dc15..84984e26398c0 100644 --- a/libc/test/src/stdio/remove_test.cpp +++ b/libc/test/src/stdio/remove_test.cpp @@ -11,17 +11,16 @@ #include "src/sys/stat/mkdirat.h" #include "src/unistd/access.h" #include "src/unistd/close.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" +#include "src/__support/libc_errno.h" #include -using LlvmLibcRemoveTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { +TEST(LlvmLibcRemoveTest, CreateAndRemoveFile) { // The test strategy is to create a file and remove it, and also verify that // it was removed. + libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -37,9 +36,10 @@ TEST_F(LlvmLibcRemoveTest, CreateAndRemoveFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILE, F_OK), Fails(ENOENT)); } -TEST_F(LlvmLibcRemoveTest, CreateAndRemoveDir) { +TEST(LlvmLibcRemoveTest, CreateAndRemoveDir) { // The test strategy is to create a dir and remove it, and also verify that // it was removed. 
+ libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; constexpr const char *FILENAME = "remove.test.dir"; diff --git a/libc/test/src/stdio/rename_test.cpp b/libc/test/src/stdio/rename_test.cpp index 135fb98c07fbb..ac494a4ecaf8e 100644 --- a/libc/test/src/stdio/rename_test.cpp +++ b/libc/test/src/stdio/rename_test.cpp @@ -8,19 +8,18 @@ #include "include/llvm-libc-macros/linux/sys-stat-macros.h" #include "include/llvm-libc-macros/linux/unistd-macros.h" +#include "src/__support/libc_errno.h" #include "src/fcntl/open.h" #include "src/stdio/rename.h" #include "src/unistd/access.h" #include "src/unistd/close.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/ErrnoSetterMatcher.h" #include "test/UnitTest/Test.h" -using LlvmLibcRenameTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { +TEST(LlvmLibcRenameTest, CreateAndRenameFile) { // The test strategy is to create a file and rename it, and also verify that // it was renamed. 
+ libc_errno = 0; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; @@ -41,7 +40,7 @@ TEST_F(LlvmLibcRenameTest, CreateAndRenameFile) { ASSERT_THAT(LIBC_NAMESPACE::access(TEST_FILEPATH0, F_OK), Fails(ENOENT)); } -TEST_F(LlvmLibcRenameTest, RenameNonExistent) { +TEST(LlvmLibcRenameTest, RenameNonExistent) { using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; constexpr const char *FILENAME1 = "rename.test.file1"; diff --git a/libc/test/src/stdio/setvbuf_test.cpp b/libc/test/src/stdio/setvbuf_test.cpp index a0936ba79ef73..5872943c1bb41 100644 --- a/libc/test/src/stdio/setvbuf_test.cpp +++ b/libc/test/src/stdio/setvbuf_test.cpp @@ -11,14 +11,12 @@ #include "src/stdio/fread.h" #include "src/stdio/fwrite.h" #include "src/stdio/setvbuf.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" #include "hdr/stdio_macros.h" +#include "src/__support/libc_errno.h" -using LlvmLibcSetvbufTest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; - -TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { +TEST(LlvmLibcSetvbufTest, SetNBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a NBF buffer to the write handle. Since it is NBF, the data // written using the write handle should be immediately readable by the read @@ -54,7 +52,7 @@ TEST_F(LlvmLibcSetvbufTest, SetNBFBuffer) { ASSERT_EQ(0, LIBC_NAMESPACE::fclose(fr)); } -TEST_F(LlvmLibcSetvbufTest, SetLBFBuffer) { +TEST(LlvmLibcSetvbufTest, SetLBFBuffer) { // The idea in this test is that we open a file for writing and reading, and // then set a LBF buffer to the write handle. 
Since it is LBF, the data // written using the write handle should be available right after a '\n' is @@ -104,5 +102,6 @@ TEST(LlvmLibcSetbufTest, InvalidBufferMode) { 0); ASSERT_ERRNO_EQ(EINVAL); + libc_errno = 0; ASSERT_EQ(0, LIBC_NAMESPACE::fclose(f)); } diff --git a/libc/test/src/stdio/unlocked_fileop_test.cpp b/libc/test/src/stdio/unlocked_fileop_test.cpp index e99b382d12112..5d482b70064bd 100644 --- a/libc/test/src/stdio/unlocked_fileop_test.cpp +++ b/libc/test/src/stdio/unlocked_fileop_test.cpp @@ -15,12 +15,11 @@ #include "src/stdio/fread_unlocked.h" #include "src/stdio/funlockfile.h" #include "src/stdio/fwrite_unlocked.h" -#include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" -using LlvmLibcFILETest = LIBC_NAMESPACE::testing::ErrnoCheckingTest; +#include "src/__support/libc_errno.h" -TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { +TEST(LlvmLibcFILETest, UnlockedReadAndWrite) { constexpr char fNAME[] = "testdata/unlocked_read_and_write.test"; ::FILE *f = LIBC_NAMESPACE::fopen(fNAME, "w"); ASSERT_FALSE(f == nullptr); @@ -37,6 +36,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fread_unlocked(data, 1, sizeof(READ_SIZE), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); @@ -57,6 +57,7 @@ TEST_F(LlvmLibcFILETest, UnlockedReadAndWrite) { LIBC_NAMESPACE::fwrite_unlocked(CONTENT, 1, sizeof(CONTENT), f)); ASSERT_NE(LIBC_NAMESPACE::ferror_unlocked(f), 0); ASSERT_ERRNO_FAILURE(); + libc_errno = 0; LIBC_NAMESPACE::clearerr_unlocked(f); ASSERT_EQ(LIBC_NAMESPACE::ferror_unlocked(f), 0); diff --git a/libc/test/src/stdlib/StrtolTest.h b/libc/test/src/stdlib/StrtolTest.h index 03f0a6539c785..3eeccc5727e77 100644 --- a/libc/test/src/stdlib/StrtolTest.h +++ b/libc/test/src/stdlib/StrtolTest.h @@ -9,6 +9,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/type_traits.h" #include 
"src/__support/ctype_utils.h" +#include "src/__support/libc_errno.h" #include "src/__support/macros/properties/architectures.h" #include "test/UnitTest/ErrnoCheckingTest.h" #include "test/UnitTest/Test.h" diff --git a/libc/test/src/stdlib/strtold_test.cpp b/libc/test/src/stdlib/strtold_test.cpp index eb4056dc7ba64..c2f2b9c9a11c3 100644 --- a/libc/test/src/stdlib/strtold_test.cpp +++ b/libc/test/src/stdlib/strtold_test.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/__support/FPUtil/FPBits.h" +#include "src/__support/libc_errno.h" #include "src/__support/uint128.h" #include "src/stdlib/strtold.h" From 452276ecc0f5d1cb9bf5e1655e422a68eafdb8b9 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Fri, 13 Jun 2025 11:00:08 -0700 Subject: [PATCH 417/851] [libc] Fix missing errno include in fuzzer (#144132) The printf parser uses errno for setting up the %m conversion. It was presumably getting this include indirectly until a recent change. This patch adds a direct dependency to fix it. --- libc/fuzzing/stdio/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/libc/fuzzing/stdio/CMakeLists.txt b/libc/fuzzing/stdio/CMakeLists.txt index 8f89baa702000..401785a30469c 100644 --- a/libc/fuzzing/stdio/CMakeLists.txt +++ b/libc/fuzzing/stdio/CMakeLists.txt @@ -4,6 +4,7 @@ add_libc_fuzzer( printf_parser_fuzz.cpp DEPENDS libc.src.stdio.printf_core.parser + libc.src.errno.errno # needed for the strerror conversion ) add_libc_fuzzer( From 0c7ce6883a04dadd9daf0d41cba58c2f9eec19ad Mon Sep 17 00:00:00 2001 From: Charitha Saumya <136391709+charithaintc@users.noreply.github.com> Date: Fri, 13 Jun 2025 11:02:05 -0700 Subject: [PATCH 418/851] Revert "[mlir][vector] Fix for WarpOpScfForOp failure when scf.for has results that are unused." (#144124) Reverts llvm/llvm-project#141853 Reverting the bug fix because it does not handle all cases correctly. 
--- .../Vector/Transforms/VectorDistribute.cpp | 39 +++++-------------- .../Vector/vector-warp-distribute.mlir | 36 ----------------- 2 files changed, 10 insertions(+), 65 deletions(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp index 52a9cedb43cc0..045c192787f10 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp @@ -1554,36 +1554,22 @@ struct WarpOpScfForOp : public WarpDistributionPattern { llvm::SmallSetVector escapingValues; SmallVector inputTypes; SmallVector distTypes; - auto collectEscapingValues = [&](Value value) { - if (!escapingValues.insert(value)) - return; - Type distType = value.getType(); - if (auto vecType = dyn_cast(distType)) { - AffineMap map = distributionMapFn(value); - distType = getDistributedType(vecType, map, warpOp.getWarpSize()); - } - inputTypes.push_back(value.getType()); - distTypes.push_back(distType); - }; - mlir::visitUsedValuesDefinedAbove( forOp.getBodyRegion(), [&](OpOperand *operand) { Operation *parent = operand->get().getParentRegion()->getParentOp(); if (warpOp->isAncestor(parent)) { - collectEscapingValues(operand->get()); + if (!escapingValues.insert(operand->get())) + return; + Type distType = operand->get().getType(); + if (auto vecType = dyn_cast(distType)) { + AffineMap map = distributionMapFn(operand->get()); + distType = getDistributedType(vecType, map, warpOp.getWarpSize()); + } + inputTypes.push_back(operand->get().getType()); + distTypes.push_back(distType); } }); - // Any forOp result that is not already yielded by the warpOp - // region is also considered escaping and must be returned by the - // original warpOp. - for (OpResult forResult : forOp.getResults()) { - // Check if this forResult is already yielded by the yield op. 
- if (llvm::is_contained(yield->getOperands(), forResult)) - continue; - collectEscapingValues(forResult); - } - if (llvm::is_contained(distTypes, Type{})) return failure(); @@ -1623,12 +1609,7 @@ struct WarpOpScfForOp : public WarpDistributionPattern { forOp.getResultTypes().end()); llvm::SmallDenseMap argIndexMapping; for (auto [i, retIdx] : llvm::enumerate(newRetIndices)) { - auto newWarpResult = newWarpOp.getResult(retIdx); - // Unused forOp results yielded by the warpOp region are already included - // in the new ForOp. - if (llvm::is_contained(newOperands, newWarpResult)) - continue; - warpInput.push_back(newWarpResult); + warpInput.push_back(newWarpOp.getResult(retIdx)); argIndexMapping[escapingValues[i]] = warpInputType.size(); warpInputType.push_back(inputTypes[i]); } diff --git a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir index 6c7ac7a5196a7..38771f2593449 100644 --- a/mlir/test/Dialect/Vector/vector-warp-distribute.mlir +++ b/mlir/test/Dialect/Vector/vector-warp-distribute.mlir @@ -584,42 +584,6 @@ func.func @warp_scf_for_multiple_yield(%arg0: index, %arg1: memref, %arg2 return } -// ----- -// CHECK-PROP-LABEL: func.func @warp_scf_for_unused_yield( -// CHECK-PROP: %[[W0:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[INI0:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: %[[INI1:.*]] = "some_def"() : () -> vector<128xf32> -// CHECK-PROP: gpu.yield %[[INI0]], %[[INI1]] : vector<128xf32>, vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: %[[F:.*]]:2 = scf.for %{{.*}} iter_args(%{{.*}} = %[[W0]]#0, %{{.*}} = %[[W0]]#1) -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[W1:.*]]:2 = gpu.warp_execute_on_lane_0(%{{.*}})[32] args(%{{.*}} : vector<4xf32>, vector<4xf32>) -> (vector<4xf32>, vector<4xf32>) { -// CHECK-PROP: %[[ACC0:.*]] = "some_def"(%{{.*}}) : (vector<128xf32>, index) -> vector<128xf32> -// CHECK-PROP: %[[ACC1:.*]] = 
"some_def"(%{{.*}}) : (index, vector<128xf32>, vector<128xf32>) -> vector<128xf32> -// CHECK-PROP: gpu.yield %[[ACC1]], %[[ACC0]] : vector<128xf32>, vector<128xf32> -// CHECK-PROP: } -// CHECK-PROP: scf.yield %[[W1]]#0, %[[W1]]#1 : vector<4xf32>, vector<4xf32> -// CHECK-PROP: } -// CHECK-PROP: "some_use"(%[[F]]#0) : (vector<4xf32>) -> () -func.func @warp_scf_for_unused_yield(%arg0: index) { - %c128 = arith.constant 128 : index - %c1 = arith.constant 1 : index - %c0 = arith.constant 0 : index - %0 = gpu.warp_execute_on_lane_0(%arg0)[32] -> (vector<4xf32>) { - %ini = "some_def"() : () -> (vector<128xf32>) - %ini1 = "some_def"() : () -> (vector<128xf32>) - %3:2 = scf.for %arg3 = %c0 to %c128 step %c1 iter_args(%arg4 = %ini, %arg5 = %ini1) -> (vector<128xf32>, vector<128xf32>) { - %add = arith.addi %arg3, %c1 : index - %1 = "some_def"(%arg5, %add) : (vector<128xf32>, index) -> (vector<128xf32>) - %acc = "some_def"(%add, %arg4, %1) : (index, vector<128xf32>, vector<128xf32>) -> (vector<128xf32>) - scf.yield %acc, %1 : vector<128xf32>, vector<128xf32> - } - gpu.yield %3#0 : vector<128xf32> - } - "some_use"(%0) : (vector<4xf32>) -> () - return -} - - // ----- // CHECK-PROP-LABEL: func @vector_reduction( From f82cf7442029d3376813db82eca60800e999bfb9 Mon Sep 17 00:00:00 2001 From: Artem Gindinson Date: Fri, 13 Jun 2025 20:03:24 +0200 Subject: [PATCH 419/851] =?UTF-8?q?[mlir][tensor]=20Fix=20`getReassociatio?= =?UTF-8?q?nForCollapse`=20for=20tensor/scalar=20re=E2=80=A6=20(#144118)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …shapes Commit 6e5a142 changed the behavior of the function when computing reassociations between tensors (consisting of unit/dynamic dimensions) and scalars/0d vectors. 
The IR representation for such reshapes actually expects an empty reassociation, like so: ``` func.func @example(%arg0 : tensor) -> tensor { %0 = tensor.collapse_shape %arg0 [] : tensor into tensor } ``` Restore the original behavior - the routine should resort to reporting failures when compile time-known non-unit dimensions are part of the attempted reassociation. Signed-off-by: Artem Gindinson --- mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp | 10 ++++------ mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp | 8 ++++---- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp index 3b1fdb69e8ef1..aa566c0086a2f 100644 --- a/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp +++ b/mlir/lib/Dialect/Utils/ReshapeOpsUtils.cpp @@ -299,19 +299,17 @@ mlir::getReassociationIndicesForCollapse(ArrayRef sourceShape, // this utility). if (numSourceDims <= numTargetDims) return std::nullopt; - // Early handling for scalar target types. + // Early handling for scalar target types. We should report an invalid + // reassociation for non-unit static dimensions - no chance to collapse these + // into a scalar. if (numTargetDims == 0) { - ReassociationIndices allSourceIndices; - allSourceIndices.reserve(numSourceDims); for (unsigned sourceDimIdx = 0; sourceDimIdx < numSourceDims; ++sourceDimIdx) { int64_t sourceSize = sourceShape[sourceDimIdx]; - // All source dimensions must be unit or dynamic. if (sourceSize != 1 && sourceSize != ShapedType::kDynamic) return std::nullopt; - allSourceIndices.push_back(sourceDimIdx); } - return SmallVector{allSourceIndices}; + return SmallVector{}; } // Collect source ranges by iterating over the target shape left-to-right. 
diff --git a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp index db1a87a4de2d5..05f97e875e2dc 100644 --- a/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp +++ b/mlir/unittests/Dialect/Utils/ReshapeOpsUtilsTest.cpp @@ -23,16 +23,16 @@ makeOptionalIndices(std::initializer_list list) { TEST(ReassociationIndicesForCollapse, ScalarTest) { EXPECT_EQ(getReassociationIndicesForCollapse({1}, {}), - makeOptionalIndices({{0}})); + makeOptionalIndices({})); EXPECT_EQ(getReassociationIndicesForCollapse({1, 1}, {}), - makeOptionalIndices({{0, 1}})); + makeOptionalIndices({})); EXPECT_EQ(getReassociationIndicesForCollapse({ShapedType::kDynamic}, {}), - makeOptionalIndices({{0}})); + makeOptionalIndices({})); EXPECT_EQ(getReassociationIndicesForCollapse({1, ShapedType::kDynamic, ShapedType::kDynamic, 1, ShapedType::kDynamic}, {}), - makeOptionalIndices({{0, 1, 2, 3, 4}})); + makeOptionalIndices({})); } TEST(ReassociationIndicesForCollapse, ScalarTestFailure) { From 52d34865b9db3485c8a671a88cc571270349f720 Mon Sep 17 00:00:00 2001 From: FYK Date: Sat, 14 Jun 2025 02:05:16 +0800 Subject: [PATCH 420/851] Fix and reapply IR PGO support for Flang (#142892) This PR resubmits the changes from #136098, which was previously reverted due to a build failure during the linking stage: ``` undefined reference to `llvm::DebugInfoCorrelate' undefined reference to `llvm::ProfileCorrelate' ``` The root cause was that `llvm/lib/Frontend/Driver/CodeGenOptions.cpp` references symbols from the `Instrumentation` component, but the `LINK_COMPONENTS` in the `llvm/lib/Frontend/CMakeLists.txt` for `LLVMFrontendDriver` did not include it. As a result, linking failed in configurations where these components were not transitively linked. ### Fix: This updated patch explicitly adds `Instrumentation` to `LINK_COMPONENTS` in the relevant `llvm/lib/Frontend/CMakeLists.txt` file to ensure the required symbols are properly resolved. 
--------- Co-authored-by: ict-ql <168183727+ict-ql@users.noreply.github.com> Co-authored-by: Chyaka <52224511+liliumshade@users.noreply.github.com> Co-authored-by: Tarun Prabhu --- clang/include/clang/Basic/CodeGenOptions.def | 6 ++- clang/include/clang/Basic/CodeGenOptions.h | 32 +++++++--------- clang/include/clang/Basic/ProfileList.h | 9 ++--- clang/include/clang/Driver/Options.td | 6 +-- clang/lib/Basic/ProfileList.cpp | 22 +++++------ clang/lib/CodeGen/BackendUtil.cpp | 9 +---- clang/lib/CodeGen/CodeGenAction.cpp | 4 +- clang/lib/CodeGen/CodeGenFunction.cpp | 3 +- clang/lib/CodeGen/CodeGenModule.cpp | 2 +- clang/lib/Driver/ToolChains/Flang.cpp | 4 ++ clang/lib/Frontend/CompilerInvocation.cpp | 6 +-- .../include/flang/Frontend/CodeGenOptions.def | 7 ++++ flang/include/flang/Frontend/CodeGenOptions.h | 38 +++++++++++++++++++ flang/lib/Frontend/CompilerInvocation.cpp | 10 +++++ flang/lib/Frontend/FrontendActions.cpp | 26 +++++++++++++ flang/test/Driver/flang-f-opts.f90 | 5 +++ .../Inputs/gcc-flag-compatibility_IR.proftext | 18 +++++++++ .../gcc-flag-compatibility_IR_entry.proftext | 11 ++++++ flang/test/Profile/gcc-flag-compatibility.f90 | 32 ++++++++++++++++ .../llvm/Frontend/Driver/CodeGenOptions.h | 13 +++++++ llvm/lib/Frontend/Driver/CMakeLists.txt | 1 + llvm/lib/Frontend/Driver/CodeGenOptions.cpp | 13 +++++++ 22 files changed, 223 insertions(+), 54 deletions(-) create mode 100644 flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext create mode 100644 flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext create mode 100644 flang/test/Profile/gcc-flag-compatibility.f90 diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def index fa9474d63ae42..2a30ff11464dd 100644 --- a/clang/include/clang/Basic/CodeGenOptions.def +++ b/clang/include/clang/Basic/CodeGenOptions.def @@ -223,9 +223,11 @@ AFFECTING_VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os (==1) or -Oz (==2) is 
CODEGENOPT(AtomicProfileUpdate , 1, 0) ///< Set -fprofile-update=atomic CODEGENOPT(ContinuousProfileSync, 1, 0) ///< Enable continuous instrumentation profiling /// Choose profile instrumenation kind or no instrumentation. -ENUM_CODEGENOPT(ProfileInstr, ProfileInstrKind, 4, ProfileNone) + +ENUM_CODEGENOPT(ProfileInstr, llvm::driver::ProfileInstrKind, 4, llvm::driver::ProfileInstrKind::ProfileNone) + /// Choose profile kind for PGO use compilation. -ENUM_CODEGENOPT(ProfileUse, ProfileInstrKind, 2, ProfileNone) +ENUM_CODEGENOPT(ProfileUse, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone) /// Partition functions into N groups and select only functions in group i to be /// instrumented. Selected group numbers can be 0 to N-1 inclusive. VALUE_CODEGENOPT(ProfileTotalFunctionGroups, 32, 1) diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index a77232c281f7f..7ba21fca6dd6b 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -80,16 +80,6 @@ class CodeGenOptions : public CodeGenOptionsBase { SRCK_InRegs // Small structs in registers (-freg-struct-return). }; - enum ProfileInstrKind { - ProfileNone, // Profile instrumentation is turned off. - ProfileClangInstr, // Clang instrumentation to generate execution counts - // to use with PGO. - ProfileIRInstr, // IR level PGO instrumentation in LLVM. - ProfileCSIRInstr, // IR level PGO context sensitive instrumentation in LLVM. - ProfileIRSampleColdCov, // IR level sample pgo based cold function coverage - // instrumentation in LLVM. - }; - enum EmbedBitcodeKind { Embed_Off, // No embedded bitcode. Embed_All, // Embed both bitcode and commandline in the output. @@ -522,35 +512,41 @@ class CodeGenOptions : public CodeGenOptionsBase { /// Check if Clang profile instrumenation is on. 
bool hasProfileClangInstr() const { - return getProfileInstr() == ProfileClangInstr; + return getProfileInstr() == + llvm::driver::ProfileInstrKind::ProfileClangInstr; } /// Check if IR level profile instrumentation is on. bool hasProfileIRInstr() const { - return getProfileInstr() == ProfileIRInstr; + return getProfileInstr() == llvm::driver::ProfileInstrKind::ProfileIRInstr; } /// Check if CS IR level profile instrumentation is on. bool hasProfileCSIRInstr() const { - return getProfileInstr() == ProfileCSIRInstr; + return getProfileInstr() == + llvm::driver::ProfileInstrKind::ProfileCSIRInstr; } /// Check if any form of instrumentation is on. - bool hasProfileInstr() const { return getProfileInstr() != ProfileNone; } + bool hasProfileInstr() const { + return getProfileInstr() != llvm::driver::ProfileInstrKind::ProfileNone; + } /// Check if Clang profile use is on. bool hasProfileClangUse() const { - return getProfileUse() == ProfileClangInstr; + return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileClangInstr; } /// Check if IR level profile use is on. bool hasProfileIRUse() const { - return getProfileUse() == ProfileIRInstr || - getProfileUse() == ProfileCSIRInstr; + return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileIRInstr || + getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr; } /// Check if CSIR profile use is on. - bool hasProfileCSIRUse() const { return getProfileUse() == ProfileCSIRInstr; } + bool hasProfileCSIRUse() const { + return getProfileUse() == llvm::driver::ProfileInstrKind::ProfileCSIRInstr; + } /// Check if type and variable info should be emitted. 
bool hasReducedDebugInfo() const { diff --git a/clang/include/clang/Basic/ProfileList.h b/clang/include/clang/Basic/ProfileList.h index b4217e49c18a3..5338ef3992ade 100644 --- a/clang/include/clang/Basic/ProfileList.h +++ b/clang/include/clang/Basic/ProfileList.h @@ -49,17 +49,16 @@ class ProfileList { ~ProfileList(); bool isEmpty() const { return Empty; } - ExclusionType getDefault(CodeGenOptions::ProfileInstrKind Kind) const; + ExclusionType getDefault(llvm::driver::ProfileInstrKind Kind) const; std::optional isFunctionExcluded(StringRef FunctionName, - CodeGenOptions::ProfileInstrKind Kind) const; + llvm::driver::ProfileInstrKind Kind) const; std::optional isLocationExcluded(SourceLocation Loc, - CodeGenOptions::ProfileInstrKind Kind) const; + llvm::driver::ProfileInstrKind Kind) const; std::optional - isFileExcluded(StringRef FileName, - CodeGenOptions::ProfileInstrKind Kind) const; + isFileExcluded(StringRef FileName, llvm::driver::ProfileInstrKind Kind) const; }; } // namespace clang diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 152df89118a6a..5951687b095e4 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -1772,7 +1772,7 @@ def fmcdc_max_test_vectors_EQ : Joined<["-"], "fmcdc-max-test-vectors=">, HelpText<"Maximum number of test vectors in MC/DC coverage">, MarshallingInfoInt, "0x7FFFFFFE">; def fprofile_generate : Flag<["-"], "fprofile-generate">, - Group, Visibility<[ClangOption, CLOption]>, + Group, Visibility<[ClangOption, CLOption, FlangOption, FC1Option]>, HelpText<"Generate instrumented code to collect execution counts into default.profraw (overridden by LLVM_PROFILE_FILE env var)">; def fprofile_generate_EQ : Joined<["-"], "fprofile-generate=">, Group, Visibility<[ClangOption, CLOption]>, @@ -1789,7 +1789,7 @@ def fprofile_use : Flag<["-"], "fprofile-use">, Group, Visibility<[ClangOption, CLOption]>, Alias; def fprofile_use_EQ : Joined<["-"], 
"fprofile-use=">, Group, - Visibility<[ClangOption, CLOption]>, + Visibility<[ClangOption, CLOption, FlangOption, FC1Option]>, MetaVarName<"">, HelpText<"Use instrumentation data for profile-guided optimization. If pathname is a directory, it reads from /default.profdata. Otherwise, it reads from file .">; def fno_profile_instr_generate : Flag<["-"], "fno-profile-instr-generate">, @@ -7761,7 +7761,7 @@ def fpatchable_function_entry_section_EQ MarshallingInfoString>; def fprofile_instrument_EQ : Joined<["-"], "fprofile-instrument=">, HelpText<"Enable PGO instrumentation">, Values<"none,clang,llvm,csllvm,sample-coldcov">, - NormalizedValuesScope<"CodeGenOptions">, + NormalizedValuesScope<"llvm::driver::ProfileInstrKind">, NormalizedValues<["ProfileNone", "ProfileClangInstr", "ProfileIRInstr", "ProfileCSIRInstr", "ProfileIRSampleColdCov"]>, MarshallingInfoEnum, "ProfileNone">; def fprofile_instrument_path_EQ : Joined<["-"], "fprofile-instrument-path=">, diff --git a/clang/lib/Basic/ProfileList.cpp b/clang/lib/Basic/ProfileList.cpp index aaea5a00ab6ae..8481deffe2a7b 100644 --- a/clang/lib/Basic/ProfileList.cpp +++ b/clang/lib/Basic/ProfileList.cpp @@ -69,24 +69,24 @@ ProfileList::ProfileList(ArrayRef Paths, SourceManager &SM) ProfileList::~ProfileList() = default; -static StringRef getSectionName(CodeGenOptions::ProfileInstrKind Kind) { +static StringRef getSectionName(llvm::driver::ProfileInstrKind Kind) { switch (Kind) { - case CodeGenOptions::ProfileNone: + case llvm::driver::ProfileInstrKind::ProfileNone: return ""; - case CodeGenOptions::ProfileClangInstr: + case llvm::driver::ProfileInstrKind::ProfileClangInstr: return "clang"; - case CodeGenOptions::ProfileIRInstr: + case llvm::driver::ProfileInstrKind::ProfileIRInstr: return "llvm"; - case CodeGenOptions::ProfileCSIRInstr: + case llvm::driver::ProfileInstrKind::ProfileCSIRInstr: return "csllvm"; - case CodeGenOptions::ProfileIRSampleColdCov: + case llvm::driver::ProfileInstrKind::ProfileIRSampleColdCov: return 
"sample-coldcov"; } - llvm_unreachable("Unhandled CodeGenOptions::ProfileInstrKind enum"); + llvm_unreachable("Unhandled llvm::driver::ProfileInstrKind enum"); } ProfileList::ExclusionType -ProfileList::getDefault(CodeGenOptions::ProfileInstrKind Kind) const { +ProfileList::getDefault(llvm::driver::ProfileInstrKind Kind) const { StringRef Section = getSectionName(Kind); // Check for "default:" if (SCL->inSection(Section, "default", "allow")) @@ -117,7 +117,7 @@ ProfileList::inSection(StringRef Section, StringRef Prefix, std::optional ProfileList::isFunctionExcluded(StringRef FunctionName, - CodeGenOptions::ProfileInstrKind Kind) const { + llvm::driver::ProfileInstrKind Kind) const { StringRef Section = getSectionName(Kind); // Check for "function:=" if (auto V = inSection(Section, "function", FunctionName)) @@ -131,13 +131,13 @@ ProfileList::isFunctionExcluded(StringRef FunctionName, std::optional ProfileList::isLocationExcluded(SourceLocation Loc, - CodeGenOptions::ProfileInstrKind Kind) const { + llvm::driver::ProfileInstrKind Kind) const { return isFileExcluded(SM.getFilename(SM.getFileLoc(Loc)), Kind); } std::optional ProfileList::isFileExcluded(StringRef FileName, - CodeGenOptions::ProfileInstrKind Kind) const { + llvm::driver::ProfileInstrKind Kind) const { StringRef Section = getSectionName(Kind); // Check for "source:=" if (auto V = inSection(Section, "source", FileName)) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 88b3a4943e0d8..7e0a3cf5591ce 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -124,17 +124,10 @@ namespace clang { extern llvm::cl::opt ClSanitizeGuardChecks; } -// Default filename used for profile generation. -static std::string getDefaultProfileGenName() { - return DebugInfoCorrelate || ProfileCorrelate != InstrProfCorrelator::NONE - ? 
"default_%m.proflite" - : "default_%m.profraw"; -} - // Path and name of file used for profile generation static std::string getProfileGenName(const CodeGenOptions &CodeGenOpts) { std::string FileName = CodeGenOpts.InstrProfileOutput.empty() - ? getDefaultProfileGenName() + ? llvm::driver::getDefaultProfileGenName() : CodeGenOpts.InstrProfileOutput; if (CodeGenOpts.ContinuousProfileSync) FileName = "%c" + FileName; diff --git a/clang/lib/CodeGen/CodeGenAction.cpp b/clang/lib/CodeGen/CodeGenAction.cpp index 1f5eb427b566f..5493cc92bd8b0 100644 --- a/clang/lib/CodeGen/CodeGenAction.cpp +++ b/clang/lib/CodeGen/CodeGenAction.cpp @@ -273,8 +273,8 @@ void BackendConsumer::HandleTranslationUnit(ASTContext &C) { std::unique_ptr OptRecordFile = std::move(*OptRecordFileOrErr); - if (OptRecordFile && - CodeGenOpts.getProfileUse() != CodeGenOptions::ProfileNone) + if (OptRecordFile && CodeGenOpts.getProfileUse() != + llvm::driver::ProfileInstrKind::ProfileNone) Ctx.setDiagnosticsHotnessRequested(true); if (CodeGenOpts.MisExpect) { diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index 56562002e7194..13d0633e9b1c0 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -943,7 +943,8 @@ void CodeGenFunction::StartFunction(GlobalDecl GD, QualType RetTy, } } - if (CGM.getCodeGenOpts().getProfileInstr() != CodeGenOptions::ProfileNone) { + if (CGM.getCodeGenOpts().getProfileInstr() != + llvm::driver::ProfileInstrKind::ProfileNone) { switch (CGM.isFunctionBlockedFromProfileInstr(Fn, Loc)) { case ProfileList::Skip: Fn->addFnAttr(llvm::Attribute::SkipProfile); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 16e49aab4fe61..451792dca40c5 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -3608,7 +3608,7 @@ CodeGenModule::isFunctionBlockedByProfileList(llvm::Function *Fn, // If the profile list is empty, then instrument 
everything. if (ProfileList.isEmpty()) return ProfileList::Allow; - CodeGenOptions::ProfileInstrKind Kind = getCodeGenOpts().getProfileInstr(); + llvm::driver::ProfileInstrKind Kind = getCodeGenOpts().getProfileInstr(); // First, check the function name. if (auto V = ProfileList.isFunctionExcluded(Fn->getName(), Kind)) return *V; diff --git a/clang/lib/Driver/ToolChains/Flang.cpp b/clang/lib/Driver/ToolChains/Flang.cpp index a20879dad94d4..47d0e345086b2 100644 --- a/clang/lib/Driver/ToolChains/Flang.cpp +++ b/clang/lib/Driver/ToolChains/Flang.cpp @@ -887,6 +887,10 @@ void Flang::ConstructJob(Compilation &C, const JobAction &JA, // TODO: Handle interactions between -w, -pedantic, -Wall, -WOption Args.AddLastArg(CmdArgs, options::OPT_w); + // recognise options: fprofile-generate -fprofile-use= + Args.addAllArgs( + CmdArgs, {options::OPT_fprofile_generate, options::OPT_fprofile_use_EQ}); + // Forward flags for OpenMP. We don't do this if the current action is an // device offloading action other than OpenMP. if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 2c02719121c73..dd021ad2e441b 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1492,11 +1492,11 @@ static void setPGOUseInstrumentor(CodeGenOptions &Opts, // which is available (might be one or both). 
if (PGOReader->isIRLevelProfile() || PGOReader->hasMemoryProfile()) { if (PGOReader->hasCSIRLevelProfile()) - Opts.setProfileUse(CodeGenOptions::ProfileCSIRInstr); + Opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileCSIRInstr); else - Opts.setProfileUse(CodeGenOptions::ProfileIRInstr); + Opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr); } else - Opts.setProfileUse(CodeGenOptions::ProfileClangInstr); + Opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileClangInstr); } void CompilerInvocation::setDefaultPointerAuthOptions( diff --git a/flang/include/flang/Frontend/CodeGenOptions.def b/flang/include/flang/Frontend/CodeGenOptions.def index a697872836569..ae12aec518108 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.def +++ b/flang/include/flang/Frontend/CodeGenOptions.def @@ -24,8 +24,15 @@ CODEGENOPT(OptimizationLevel, 2, 0) ///< The -O[0-3] option specified. CODEGENOPT(DebugPassManager, 1, 0) ///< Prints debug information for the new ///< pass manager. + +/// Choose profile instrumenation kind or no instrumentation. +ENUM_CODEGENOPT(ProfileInstr, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone) +/// Choose profile kind for PGO use compilation. +ENUM_CODEGENOPT(ProfileUse, llvm::driver::ProfileInstrKind, 2, llvm::driver::ProfileInstrKind::ProfileNone) + CODEGENOPT(InstrumentFunctions, 1, 0) ///< Set when -finstrument_functions is ///< enabled on the compile step. + CODEGENOPT(IsPIE, 1, 0) ///< PIE level is the same as PIC Level. CODEGENOPT(PICLevel, 2, 0) ///< PIC level of the LLVM module. 
CODEGENOPT(PrepareForFullLTO , 1, 0) ///< Set when -flto is enabled on the diff --git a/flang/include/flang/Frontend/CodeGenOptions.h b/flang/include/flang/Frontend/CodeGenOptions.h index e939f10f3c3e7..bad17c8309eb8 100644 --- a/flang/include/flang/Frontend/CodeGenOptions.h +++ b/flang/include/flang/Frontend/CodeGenOptions.h @@ -154,6 +154,44 @@ class CodeGenOptions : public CodeGenOptionsBase { /// OpenMP is enabled. using DoConcurrentMappingKind = flangomp::DoConcurrentMappingKind; + /// Name of the profile file to use as output for -fprofile-instr-generate, + /// -fprofile-generate, and -fcs-profile-generate. + std::string InstrProfileOutput; + + /// Name of the profile file to use as input for -fmemory-profile-use. + std::string MemoryProfileUsePath; + + /// Name of the profile file to use as input for -fprofile-instr-use + std::string ProfileInstrumentUsePath; + + /// Name of the profile remapping file to apply to the profile data supplied + /// by -fprofile-sample-use or -fprofile-instr-use. + std::string ProfileRemappingFile; + + /// Check if Clang profile instrumenation is on. + bool hasProfileClangInstr() const { + return getProfileInstr() == llvm::driver::ProfileClangInstr; + } + + /// Check if IR level profile instrumentation is on. + bool hasProfileIRInstr() const { + return getProfileInstr() == llvm::driver::ProfileIRInstr; + } + + /// Check if CS IR level profile instrumentation is on. + bool hasProfileCSIRInstr() const { + return getProfileInstr() == llvm::driver::ProfileCSIRInstr; + } + /// Check if IR level profile use is on. + bool hasProfileIRUse() const { + return getProfileUse() == llvm::driver::ProfileIRInstr || + getProfileUse() == llvm::driver::ProfileCSIRInstr; + } + /// Check if CSIR profile use is on. + bool hasProfileCSIRUse() const { + return getProfileUse() == llvm::driver::ProfileCSIRInstr; + } + // Define accessors/mutators for code generation options of enumeration type. 
#define CODEGENOPT(Name, Bits, Default) #define ENUM_CODEGENOPT(Name, Type, Bits, Default) \ diff --git a/flang/lib/Frontend/CompilerInvocation.cpp b/flang/lib/Frontend/CompilerInvocation.cpp index 15bcff254756e..147849b0b7d2a 100644 --- a/flang/lib/Frontend/CompilerInvocation.cpp +++ b/flang/lib/Frontend/CompilerInvocation.cpp @@ -29,6 +29,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Frontend/Debug/Options.h" +#include "llvm/Frontend/Driver/CodeGenOptions.h" #include "llvm/Option/Arg.h" #include "llvm/Option/ArgList.h" #include "llvm/Option/OptTable.h" @@ -441,6 +442,15 @@ static void parseCodeGenArgs(Fortran::frontend::CodeGenOptions &opts, opts.IsPIE = 1; } + if (args.hasArg(clang::driver::options::OPT_fprofile_generate)) { + opts.setProfileInstr(llvm::driver::ProfileInstrKind::ProfileIRInstr); + } + + if (auto A = args.getLastArg(clang::driver::options::OPT_fprofile_use_EQ)) { + opts.setProfileUse(llvm::driver::ProfileInstrKind::ProfileIRInstr); + opts.ProfileInstrumentUsePath = A->getValue(); + } + // -mcmodel option. 
if (const llvm::opt::Arg *a = args.getLastArg(clang::driver::options::OPT_mcmodel_EQ)) { diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 1c8a419188b89..d684eeb696755 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -56,10 +56,12 @@ #include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/PassPlugin.h" #include "llvm/Passes/StandardInstrumentations.h" +#include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/Support/AMDGPUAddrSpace.h" #include "llvm/Support/Error.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Support/PGOOptions.h" #include "llvm/Support/Path.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" @@ -67,6 +69,7 @@ #include "llvm/TargetParser/RISCVISAInfo.h" #include "llvm/TargetParser/RISCVTargetParser.h" #include "llvm/Transforms/IPO/Internalize.h" +#include "llvm/Transforms/Instrumentation/InstrProfiling.h" #include "llvm/Transforms/Utils/ModuleUtils.h" #include #include @@ -919,6 +922,29 @@ void CodeGenAction::runOptimizationPipeline(llvm::raw_pwrite_stream &os) { llvm::PassInstrumentationCallbacks pic; llvm::PipelineTuningOptions pto; std::optional pgoOpt; + + if (opts.hasProfileIRInstr()) { + // -fprofile-generate. + pgoOpt = llvm::PGOOptions(opts.InstrProfileOutput.empty() + ? llvm::driver::getDefaultProfileGenName() + : opts.InstrProfileOutput, + "", "", opts.MemoryProfileUsePath, nullptr, + llvm::PGOOptions::IRInstr, + llvm::PGOOptions::NoCSAction, + llvm::PGOOptions::ColdFuncOpt::Default, false, + /*PseudoProbeForProfiling=*/false, false); + } else if (opts.hasProfileIRUse()) { + llvm::IntrusiveRefCntPtr VFS = + llvm::vfs::getRealFileSystem(); + // -fprofile-use. + auto CSAction = opts.hasProfileCSIRUse() ? 
llvm::PGOOptions::CSIRUse + : llvm::PGOOptions::NoCSAction; + pgoOpt = llvm::PGOOptions( + opts.ProfileInstrumentUsePath, "", opts.ProfileRemappingFile, + opts.MemoryProfileUsePath, VFS, llvm::PGOOptions::IRUse, CSAction, + llvm::PGOOptions::ColdFuncOpt::Default, false); + } + llvm::StandardInstrumentations si(llvmModule->getContext(), opts.DebugPassManager); si.registerCallbacks(pic, &mam); diff --git a/flang/test/Driver/flang-f-opts.f90 b/flang/test/Driver/flang-f-opts.f90 index 4493a519e2010..b972b9b7b2a59 100644 --- a/flang/test/Driver/flang-f-opts.f90 +++ b/flang/test/Driver/flang-f-opts.f90 @@ -8,3 +8,8 @@ ! CHECK-LABEL: "-fc1" ! CHECK: -ffp-contract=off ! CHECK: -O3 + +! RUN: %flang -### -S -fprofile-generate %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-GENERATE-LLVM %s +! CHECK-PROFILE-GENERATE-LLVM: "-fprofile-generate" +! RUN: %flang -### -S -fprofile-use=%S %s 2>&1 | FileCheck -check-prefix=CHECK-PROFILE-USE-DIR %s +! CHECK-PROFILE-USE-DIR: "-fprofile-use={{.*}}" diff --git a/flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext new file mode 100644 index 0000000000000..2650fb5ebfd35 --- /dev/null +++ b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR.proftext @@ -0,0 +1,18 @@ +# IR level Instrumentation Flag +:ir +_QQmain +# Func Hash: +146835646621254984 +# Num Counters: +2 +# Counter Values: +100 +1 + +main +# Func Hash: +742261418966908927 +# Num Counters: +1 +# Counter Values: +1 \ No newline at end of file diff --git a/flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext new file mode 100644 index 0000000000000..c4a2a26557e80 --- /dev/null +++ b/flang/test/Profile/Inputs/gcc-flag-compatibility_IR_entry.proftext @@ -0,0 +1,11 @@ +# IR level Instrumentation Flag +:ir +:entry_first +_QQmain +# Func Hash: +146835646621254984 +# Num Counters: +2 +# Counter Values: +100 +1 \ No newline at end of 
file diff --git a/flang/test/Profile/gcc-flag-compatibility.f90 b/flang/test/Profile/gcc-flag-compatibility.f90 new file mode 100644 index 0000000000000..4490c45232d28 --- /dev/null +++ b/flang/test/Profile/gcc-flag-compatibility.f90 @@ -0,0 +1,32 @@ +! Tests for -fprofile-generate and -fprofile-use flag compatibility. These two +! flags behave similarly to their GCC counterparts: +! +! -fprofile-generate Generates the profile file ./default.profraw +! -fprofile-use=/file Uses the profile file /file + +! On AIX, -flto used to be required with -fprofile-generate. gcc-flag-compatibility-aix.c is used to do the testing on AIX with -flto +! RUN: %flang %s -c -S -o - -emit-llvm -fprofile-generate | FileCheck -check-prefix=PROFILE-GEN %s +! PROFILE-GEN: @__profc_{{_?}}main = {{(private|internal)}} global [1 x i64] zeroinitializer, section +! PROFILE-GEN: @__profd_{{_?}}main = + +! Check that -fprofile-use=some/path/file.prof reads some/path/file.prof +! This uses LLVM IR format profile. +! RUN: rm -rf %t.dir +! RUN: mkdir -p %t.dir/some/path +! RUN: llvm-profdata merge %S/Inputs/gcc-flag-compatibility_IR.proftext -o %t.dir/some/path/file.prof +! RUN: %flang %s -o - -emit-llvm -S -fprofile-use=%t.dir/some/path/file.prof | FileCheck -check-prefix=PROFILE-USE-IR1 %s +! RUN: llvm-profdata merge %S/Inputs/gcc-flag-compatibility_IR_entry.proftext -o %t.dir/some/path/file.prof +! RUN: %flang %s -o - -emit-llvm -S -fprofile-use=%t.dir/some/path/file.prof | FileCheck -check-prefix=PROFILE-USE-IR2 %s +! PROFILE-USE-IR1: = !{!"branch_weights", i32 100, i32 1} +! 
PROFILE-USE-IR2: = !{!"branch_weights", i32 1, i32 100} + +program main + implicit none + integer :: i + integer :: X = 0 + + do i = 0, 99 + X = X + i + end do + +end program main diff --git a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h index e8e70c0e126a9..f0168c0407884 100644 --- a/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h +++ b/llvm/include/llvm/Frontend/Driver/CodeGenOptions.h @@ -14,6 +14,7 @@ #define LLVM_FRONTEND_DRIVER_CODEGENOPTIONS_H #include "llvm/Support/Compiler.h" +#include namespace llvm { class Triple; @@ -51,6 +52,18 @@ enum class VectorLibrary { LLVM_ABI TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple, VectorLibrary Veclib); +enum ProfileInstrKind { + ProfileNone, // Profile instrumentation is turned off. + ProfileClangInstr, // Clang instrumentation to generate execution counts + // to use with PGO. + ProfileIRInstr, // IR level PGO instrumentation in LLVM. + ProfileCSIRInstr, // IR level PGO context sensitive instrumentation in LLVM. + ProfileIRSampleColdCov, // IR level sample pgo based cold function coverage + // instrumentation in LLVM. +}; + +// Default filename used for profile generation. 
+std::string getDefaultProfileGenName(); } // end namespace llvm::driver #endif diff --git a/llvm/lib/Frontend/Driver/CMakeLists.txt b/llvm/lib/Frontend/Driver/CMakeLists.txt index 23de4994a300d..9feee6fe6929b 100644 --- a/llvm/lib/Frontend/Driver/CMakeLists.txt +++ b/llvm/lib/Frontend/Driver/CMakeLists.txt @@ -12,4 +12,5 @@ add_llvm_component_library(LLVMFrontendDriver Core Support Analysis + Instrumentation ) diff --git a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp index 52080dea93c98..df884908845d2 100644 --- a/llvm/lib/Frontend/Driver/CodeGenOptions.cpp +++ b/llvm/lib/Frontend/Driver/CodeGenOptions.cpp @@ -8,8 +8,15 @@ #include "llvm/Frontend/Driver/CodeGenOptions.h" #include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/ProfileData/InstrProfCorrelator.h" #include "llvm/TargetParser/Triple.h" +namespace llvm { +extern llvm::cl::opt DebugInfoCorrelate; +extern llvm::cl::opt + ProfileCorrelate; +} // namespace llvm + namespace llvm::driver { TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple, @@ -56,4 +63,10 @@ TargetLibraryInfoImpl *createTLII(const llvm::Triple &TargetTriple, return TLII; } +std::string getDefaultProfileGenName() { + return llvm::DebugInfoCorrelate || + llvm::ProfileCorrelate != InstrProfCorrelator::NONE + ? 
"default_%m.proflite" + : "default_%m.profraw"; +} } // namespace llvm::driver From f6bf3bd5e001918780e7b1e8fceeb02604d65783 Mon Sep 17 00:00:00 2001 From: Reid Kleckner Date: Fri, 13 Jun 2025 11:08:15 -0700 Subject: [PATCH 421/851] [bazel] Fix XeGpu deps for 5578bcbcfd25c (#144133) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 7bcb1d4ca883c..b62d5595fe941 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3506,6 +3506,7 @@ cc_library( ":LoopLikeInterface", ":MemRefDialect", ":Pass", + ":SCFTransforms", ":TransformUtils", ":VectorDialect", ":VectorTransforms", From 59388fb0b92d7efd5737efd6c7b6d5c82f1bc6a8 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Fri, 13 Jun 2025 11:16:44 -0700 Subject: [PATCH 422/851] [InstCombine] Preserve NSW/NUW flags when folding const BOp with min/max (#143471) When folding `X Pred C2 ? X BOp C1 : C2 BOp C1` to `min/max(X, C2) BOp C1`, if NUW/NSW flags are present on `X BOp C1` and could be safely applied to `C2 BOp C1`, then they may be added on the BOp after the fold is complete. https://alive2.llvm.org/ce/z/n_3aNJ Preserving these flags can allow subsequent transforms to re-order the min/max and BOp, which in the case of NVPTX would allow for some potential future transformations which would improve instruction-selection. 
--- .../InstCombine/InstCombineInternal.h | 2 + .../InstCombine/InstCombineSelect.cpp | 36 ++++++-- .../InstCombine/canonicalize-const-to-bop.ll | 83 ++++++++++++++++++- 3 files changed, 110 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index ce0e843437b53..8c9de862fe8f2 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -771,6 +771,8 @@ class LLVM_LIBRARY_VISIBILITY InstCombinerImpl final Value *A, Value *B, Instruction &Outer, SelectPatternFlavor SPF2, Value *C); Instruction *foldSelectInstWithICmp(SelectInst &SI, ICmpInst *ICI); + Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal, + Value *FalseVal); Instruction *foldSelectValueEquivalence(SelectInst &SI, CmpInst &CI); bool replaceInInstruction(Value *V, Value *Old, Value *New, unsigned Depth = 0); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 320b827bdbe86..73ba0f78e8053 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1879,9 +1879,9 @@ static Instruction *foldSelectICmpEq(SelectInst &SI, ICmpInst *ICI, /// Fold `X Pred C1 ? X BOp C2 : C1 BOp C2` to `min/max(X, C1) BOp C2`. /// This allows for better canonicalization. 
-static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal, - Value *FalseVal, - IRBuilderBase &Builder) { +Value *InstCombinerImpl::foldSelectWithConstOpToBinOp(ICmpInst *Cmp, + Value *TrueVal, + Value *FalseVal) { Constant *C1, *C2, *C3; Value *X; CmpPredicate Predicate; @@ -1945,11 +1945,29 @@ static Value *foldSelectWithConstOpToBinOp(ICmpInst *Cmp, Value *TrueVal, return nullptr; } - Intrinsic::ID IntrinsicID = getMinMaxIntrinsic(SPF); - Value *Intrinsic = Builder.CreateBinaryIntrinsic(IntrinsicID, X, RHS); - return IsIntrinsic ? Builder.CreateBinaryIntrinsic(Opcode, Intrinsic, C2) - : Builder.CreateBinOp(Instruction::BinaryOps(Opcode), - Intrinsic, C2); + Intrinsic::ID MinMaxID = getMinMaxIntrinsic(SPF); + Value *MinMax = Builder.CreateBinaryIntrinsic(MinMaxID, X, RHS); + if (IsIntrinsic) + return Builder.CreateBinaryIntrinsic(Opcode, MinMax, C2); + + const auto BinOpc = Instruction::BinaryOps(Opcode); + Value *BinOp = Builder.CreateBinOp(BinOpc, MinMax, C2); + + // If we can attach no-wrap flags to the new instruction, do so if the + // old instruction had them and C1 BinOp C2 does not overflow. + if (Instruction *BinOpInst = dyn_cast(BinOp)) { + if (BinOpc == Instruction::Add || BinOpc == Instruction::Sub || + BinOpc == Instruction::Mul) { + Instruction *OldBinOp = cast(TrueVal); + if (OldBinOp->hasNoSignedWrap() && + willNotOverflow(BinOpc, RHS, C2, *BinOpInst, /*IsSigned=*/true)) + BinOpInst->setHasNoSignedWrap(); + if (OldBinOp->hasNoUnsignedWrap() && + willNotOverflow(BinOpc, RHS, C2, *BinOpInst, /*IsSigned=*/false)) + BinOpInst->setHasNoUnsignedWrap(); + } + } + return BinOp; } /// Visit a SelectInst that has an ICmpInst as its first operand. 
@@ -2027,7 +2045,7 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, if (Value *V = foldAbsDiff(ICI, TrueVal, FalseVal, Builder)) return replaceInstUsesWith(SI, V); - if (Value *V = foldSelectWithConstOpToBinOp(ICI, TrueVal, FalseVal, Builder)) + if (Value *V = foldSelectWithConstOpToBinOp(ICI, TrueVal, FalseVal)) return replaceInstUsesWith(SI, V); return Changed ? &SI : nullptr; diff --git a/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll b/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll index c08ec1bb7de0d..b3093a92624ae 100644 --- a/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll +++ b/llvm/test/Transforms/InstCombine/canonicalize-const-to-bop.ll @@ -5,7 +5,7 @@ define i8 @add_and_sgt(i8 %x) { ; CHECK-LABEL: define i8 @add_and_sgt( ; CHECK-SAME: i8 [[X:%.*]]) { ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 8) -; CHECK-NEXT: [[S:%.*]] = add nuw i8 [[TMP1]], 16 +; CHECK-NEXT: [[S:%.*]] = add nuw nsw i8 [[TMP1]], 16 ; CHECK-NEXT: ret i8 [[S]] ; %add = add nsw i8 %x, 16 @@ -155,7 +155,7 @@ define i8 @multi_use_cond_and_sel(i8 %x) { ; CHECK-NEXT: [[CMP:%.*]] = icmp sgt i8 [[X]], 8 ; CHECK-NEXT: call void @use(i1 [[CMP]]) ; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smax.i8(i8 [[X]], i8 8) -; CHECK-NEXT: [[S:%.*]] = add nuw i8 [[TMP1]], 16 +; CHECK-NEXT: [[S:%.*]] = add nuw nsw i8 [[TMP1]], 16 ; CHECK-NEXT: call void @use_byte(i8 [[S]]) ; CHECK-NEXT: ret i8 [[S]] ; @@ -450,3 +450,82 @@ define i8 @umax_sgt(i8 %x) { %s = select i1 %cmp, i8 100, i8 %umax ret i8 %s } + +define i8 @add_sgt_nuw_nsw_safe(i8 %x) { +; CHECK-LABEL: define i8 @add_sgt_nuw_nsw_safe( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 100) +; CHECK-NEXT: [[S:%.*]] = add nuw nsw i8 [[TMP1]], 1 +; CHECK-NEXT: ret i8 [[S]] +; + %add = add nuw nsw i8 %x, 1 + %cmp = icmp sgt i8 %x, 100 + %s = select i1 %cmp, i8 101, i8 %add + ret i8 %s +} + +define i8 @add_sgt_nuw_only(i8 %x) { 
+; CHECK-LABEL: define i8 @add_sgt_nuw_only( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 100) +; CHECK-NEXT: [[S:%.*]] = add nuw i8 [[TMP1]], 50 +; CHECK-NEXT: ret i8 [[S]] +; + %add = add nuw nsw i8 %x, 50 + %cmp = icmp sgt i8 %x, 100 + %s = select i1 %cmp, i8 150, i8 %add + ret i8 %s +} + +define i8 @add_sgt_nsw_only(i8 %x) { +; CHECK-LABEL: define i8 @add_sgt_nsw_only( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.smin.i8(i8 [[X]], i8 100) +; CHECK-NEXT: [[S:%.*]] = add nsw i8 [[TMP1]], -99 +; CHECK-NEXT: ret i8 [[S]] +; + %add = add nuw nsw i8 %x, -99 + %cmp = icmp sgt i8 %x, 100 + %s = select i1 %cmp, i8 1, i8 %add + ret i8 %s +} + + +define i8 @mul_ult_nuw_nsw_safe(i8 %x) { +; CHECK-LABEL: define i8 @mul_ult_nuw_nsw_safe( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 10) +; CHECK-NEXT: [[S:%.*]] = mul nuw nsw i8 [[TMP1]], 3 +; CHECK-NEXT: ret i8 [[S]] +; + %mul = mul nuw nsw i8 %x, 3 + %cmp = icmp ult i8 %x, 10 + %s = select i1 %cmp, i8 30, i8 %mul + ret i8 %s +} + +define i8 @mul_ult_nuw_only(i8 %x) { +; CHECK-LABEL: define i8 @mul_ult_nuw_only( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 10) +; CHECK-NEXT: [[S:%.*]] = mul nuw i8 [[TMP1]], 25 +; CHECK-NEXT: ret i8 [[S]] +; + %mul = mul nuw nsw i8 %x, 25 + %cmp = icmp ult i8 %x, 10 + %s = select i1 %cmp, i8 250, i8 %mul + ret i8 %s +} + +define i8 @mul_ult_nsw_only(i8 %x) { +; CHECK-LABEL: define i8 @mul_ult_nsw_only( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[TMP1:%.*]] = call i8 @llvm.umax.i8(i8 [[X]], i8 40) +; CHECK-NEXT: [[S:%.*]] = mul nsw i8 [[TMP1]], -2 +; CHECK-NEXT: ret i8 [[S]] +; + %mul = mul nuw nsw i8 %x, -2 + %cmp = icmp ult i8 %x, 40 + %s = select i1 %cmp, i8 -80, i8 %mul + ret i8 %s +} From f68848015f62156b8c3539b44f16d9c8b0a93a89 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 13 Jun 2025 19:17:01 
+0100 Subject: [PATCH 423/851] [VPlan] Manage Sentinel value for FindLastIV in VPlan. (#142291) Similar to modeling the start value as operand, also model the sentinel value as operand explicitly. This makes all required information for code-gen available directly in VPlan. PR: https://github.com/llvm/llvm-project/pull/142291 --- .../Transforms/Vectorize/LoopVectorize.cpp | 20 +++++++++++-------- .../Transforms/Vectorize/VPlanPatternMatch.h | 19 ++++++++++++++++++ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 8 ++++---- llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp | 2 +- .../vplan-printing-reductions.ll | 2 +- 5 files changed, 37 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fa313243a57da..69b60c7b93208 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -7266,9 +7266,11 @@ static void fixReductionScalarResumeWhenVectorizingEpilog( using namespace llvm::PatternMatch; Value *Cmp, *OrigResumeV, *CmpOp; bool IsExpectedPattern = - match(MainResumeValue, m_Select(m_OneUse(m_Value(Cmp)), - m_Specific(RdxDesc.getSentinelValue()), - m_Value(OrigResumeV))) && + match(MainResumeValue, + m_Select( + m_OneUse(m_Value(Cmp)), + m_Specific(EpiRedResult->getOperand(2)->getLiveInIRValue()), + m_Value(OrigResumeV))) && (match(Cmp, m_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(OrigResumeV), m_Value(CmpOp))) && ((CmpOp == StartV && isGuaranteedNotToBeUndefOrPoison(CmpOp)))); @@ -9235,9 +9237,11 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( if (RecurrenceDescriptor::isFindLastIVRecurrenceKind( RdxDesc.getRecurrenceKind())) { VPValue *Start = PhiR->getStartValue(); - FinalReductionResult = - Builder.createNaryOp(VPInstruction::ComputeFindLastIVResult, - {PhiR, Start, NewExitingVPV}, ExitDL); + FinalReductionResult = Builder.createNaryOp( + VPInstruction::ComputeFindLastIVResult, + {PhiR, Start, 
Plan->getOrAddLiveIn(RdxDesc.getSentinelValue()), + NewExitingVPV}, + ExitDL); } else if (RecurrenceDescriptor::isAnyOfRecurrenceKind( RdxDesc.getRecurrenceKind())) { VPValue *Start = PhiR->getStartValue(); @@ -9825,8 +9829,8 @@ preparePlanForEpilogueVectorLoop(VPlan &Plan, Loop *L, BasicBlock *ResumeBB = cast(ResumeV)->getParent(); IRBuilder<> Builder(ResumeBB, ResumeBB->getFirstNonPHIIt()); Value *Cmp = Builder.CreateICmpEQ(ResumeV, ToFrozen[StartV]); - ResumeV = - Builder.CreateSelect(Cmp, RdxDesc.getSentinelValue(), ResumeV); + ResumeV = Builder.CreateSelect( + Cmp, RdxResult->getOperand(2)->getLiveInIRValue(), ResumeV); } else { VPValue *StartVal = Plan.getOrAddLiveIn(ResumeV); auto *PhiR = dyn_cast(&R); diff --git a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h index dfd9fc3d4d719..b2535fe3aa578 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h +++ b/llvm/lib/Transforms/Vectorize/VPlanPatternMatch.h @@ -318,6 +318,25 @@ m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2) { {Op0, Op1, Op2}); } +template +using Recipe4Op_match = Recipe_match, + Opcode, Commutative, RecipeTys...>; + +template +using VPInstruction4Op_match = + Recipe4Op_match; + +template +inline VPInstruction4Op_match +m_VPInstruction(const Op0_t &Op0, const Op1_t &Op1, const Op2_t &Op2, + const Op3_t &Op3) { + return VPInstruction4Op_match( + {Op0, Op1, Op2, Op3}); +} template inline UnaryVPInstruction_match m_Freeze(const Op0_t &Op0) { diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index ccce0e07e4d0a..d59cec892d405 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -645,16 +645,16 @@ Value *VPInstruction::generate(VPTransformState &State) { // The recipe's operands are the reduction phi, followed by one operand for // each part of the reduction. 
- unsigned UF = getNumOperands() - 2; - Value *ReducedPartRdx = State.get(getOperand(2)); + unsigned UF = getNumOperands() - 3; + Value *ReducedPartRdx = State.get(getOperand(3)); for (unsigned Part = 1; Part < UF; ++Part) { ReducedPartRdx = createMinMaxOp(Builder, RecurKind::SMax, ReducedPartRdx, - State.get(getOperand(2 + Part))); + State.get(getOperand(3 + Part))); } return createFindLastIVReduction(Builder, ReducedPartRdx, State.get(getOperand(1), true), - RdxDesc.getSentinelValue()); + getOperand(2)->getLiveInIRValue()); } case VPInstruction::ComputeReductionResult: { // FIXME: The cross-recipe dependency on VPReductionPHIRecipe is temporary diff --git a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp index e4c068ef175bc..dfb5bfabd22b8 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanUnroll.cpp @@ -347,7 +347,7 @@ void UnrollState::unrollBlock(VPBlockBase *VPB) { match(&R, m_VPInstruction( m_VPValue(), m_VPValue(Op1))) || match(&R, m_VPInstruction( - m_VPValue(), m_VPValue(), m_VPValue(Op1)))) { + m_VPValue(), m_VPValue(), m_VPValue(), m_VPValue(Op1)))) { addUniformForAllParts(cast(&R)); for (unsigned Part = 1; Part != UF; ++Part) R.addOperand(getValueForPart(Op1, Part)); diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 95fbc4260587a..978f1b80d26da 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -240,7 +240,7 @@ define i64 @find_last_iv(ptr %a, i64 %n, i64 %start) { ; CHECK-NEXT: Successor(s): middle.block ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: -; CHECK-NEXT: EMIT vp<[[RDX_RES:%.+]]> = compute-find-last-iv-result ir<%rdx>, ir<%start>, ir<%cond> +; CHECK-NEXT: EMIT vp<[[RDX_RES:%.+]]> = compute-find-last-iv-result ir<%rdx>, ir<%start>, ir<-9223372036854775808>, 
ir<%cond> ; CHECK-NEXT: EMIT vp<[[EXT:%.+]]> = extract-last-element vp<[[RDX_RES]]> ; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%n>, vp<{{.+}}> ; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> From 24bbc820701b49ab8bc7b9670034e39e11da8a16 Mon Sep 17 00:00:00 2001 From: Andy Kaylor Date: Fri, 13 Jun 2025 11:20:32 -0700 Subject: [PATCH 424/851] [CIR] Support for static variables (#143980) This adds support for emitting static variables and their initializers. --- .../CIR/Dialect/Builder/CIRBaseBuilder.h | 14 + clang/lib/CIR/CodeGen/CIRGenBuilder.h | 18 ++ clang/lib/CIR/CodeGen/CIRGenDecl.cpp | 248 +++++++++++++++++- clang/lib/CIR/CodeGen/CIRGenFunction.h | 6 + clang/lib/CIR/CodeGen/CIRGenModule.h | 13 + clang/test/CIR/CodeGen/static-vars.c | 37 +++ clang/test/CIR/CodeGen/static-vars.cpp | 49 ++++ 7 files changed, 383 insertions(+), 2 deletions(-) create mode 100644 clang/test/CIR/CodeGen/static-vars.c create mode 100644 clang/test/CIR/CodeGen/static-vars.cpp diff --git a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h index a3754f4de66b0..502d58d7db8b5 100644 --- a/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h +++ b/clang/include/clang/CIR/Dialect/Builder/CIRBaseBuilder.h @@ -185,11 +185,25 @@ class CIRBaseBuilderTy : public mlir::OpBuilder { global.getSymName()); } + mlir::Value createGetGlobal(cir::GlobalOp global) { + return createGetGlobal(global.getLoc(), global); + } + cir::StoreOp createStore(mlir::Location loc, mlir::Value val, mlir::Value dst, mlir::IntegerAttr align = {}) { return create(loc, val, dst, align); } + [[nodiscard]] cir::GlobalOp createGlobal(mlir::ModuleOp mlirModule, + mlir::Location loc, + mlir::StringRef name, + mlir::Type type, + cir::GlobalLinkageKind linkage) { + mlir::OpBuilder::InsertionGuard guard(*this); + setInsertionPointToStart(mlirModule.getBody()); + return create(loc, name, type, linkage); + } + cir::GetMemberOp createGetMember(mlir::Location loc, 
mlir::Type resultTy, mlir::Value base, llvm::StringRef name, unsigned index) { diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.h b/clang/lib/CIR/CodeGen/CIRGenBuilder.h index a4bc69619d60c..adf7cb77f1a5d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.h +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.h @@ -24,6 +24,7 @@ namespace clang::CIRGen { class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { const CIRGenTypeCache &typeCache; llvm::StringMap recordNames; + llvm::StringMap globalsVersioning; public: CIRGenBuilderTy(mlir::MLIRContext &mlirContext, const CIRGenTypeCache &tc) @@ -371,6 +372,23 @@ class CIRGenBuilderTy : public cir::CIRBaseBuilderTy { /// pointed to by \p arrayPtr. mlir::Value maybeBuildArrayDecay(mlir::Location loc, mlir::Value arrayPtr, mlir::Type eltTy); + + /// Creates a versioned global variable. If the symbol is already taken, an ID + /// will be appended to the symbol. The returned global must always be queried + /// for its name so it can be referenced correctly. + [[nodiscard]] cir::GlobalOp + createVersionedGlobal(mlir::ModuleOp module, mlir::Location loc, + mlir::StringRef name, mlir::Type type, + cir::GlobalLinkageKind linkage) { + // Create a unique name if the given name is already taken. + std::string uniqueName; + if (unsigned version = globalsVersioning[name.str()]++) + uniqueName = name.str() + "." 
+ std::to_string(version); + else + uniqueName = name.str(); + + return createGlobal(module, loc, uniqueName, type, linkage); + } }; } // namespace clang::CIRGen diff --git a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp index 748c2b5f6fceb..1941b5066edb4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenDecl.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenDecl.cpp @@ -208,8 +208,25 @@ void CIRGenFunction::emitVarDecl(const VarDecl &d) { if (d.hasExternalStorage()) return; - if (d.getStorageDuration() != SD_Automatic) - cgm.errorNYI(d.getSourceRange(), "emitVarDecl automatic storage duration"); + if (d.getStorageDuration() != SD_Automatic) { + // Static sampler variables translated to function calls. + if (d.getType()->isSamplerT()) { + // Nothing needs to be done here, but let's flag it as an error until we + // have a test. It requires OpenCL support. + cgm.errorNYI(d.getSourceRange(), "emitVarDecl static sampler type"); + return; + } + + cir::GlobalLinkageKind linkage = + cgm.getCIRLinkageVarDefinition(&d, /*IsConstant=*/false); + + // FIXME: We need to force the emission/use of a guard variable for + // some variables even if we can constant-evaluate them because + // we can't guarantee every translation unit will constant-evaluate them. + + return emitStaticVarDecl(d, linkage); + } + if (d.getType().getAddressSpace() == LangAS::opencl_local) cgm.errorNYI(d.getSourceRange(), "emitVarDecl openCL address space"); @@ -219,6 +236,233 @@ void CIRGenFunction::emitVarDecl(const VarDecl &d) { return emitAutoVarDecl(d); } +static std::string getStaticDeclName(CIRGenModule &cgm, const VarDecl &d) { + if (cgm.getLangOpts().CPlusPlus) + return cgm.getMangledName(&d).str(); + + // If this isn't C++, we don't need a mangled name, just a pretty one. 
+ assert(!d.isExternallyVisible() && "name shouldn't matter"); + std::string contextName; + const DeclContext *dc = d.getDeclContext(); + if (auto *cd = dyn_cast(dc)) + dc = cast(cd->getNonClosureContext()); + if (const auto *fd = dyn_cast(dc)) + contextName = std::string(cgm.getMangledName(fd)); + else if (isa(dc)) + cgm.errorNYI(d.getSourceRange(), "block decl context for static var"); + else if (isa(dc)) + cgm.errorNYI(d.getSourceRange(), "ObjC decl context for static var"); + else + cgm.errorNYI(d.getSourceRange(), "Unknown context for static var decl"); + + contextName += "." + d.getNameAsString(); + return contextName; +} + +// TODO(cir): LLVM uses a Constant base class. Maybe CIR could leverage an +// interface for all constants? +cir::GlobalOp +CIRGenModule::getOrCreateStaticVarDecl(const VarDecl &d, + cir::GlobalLinkageKind linkage) { + // In general, we don't always emit static var decls once before we reference + // them. It is possible to reference them before emitting the function that + // contains them, and it is possible to emit the containing function multiple + // times. + if (cir::GlobalOp existingGV = getStaticLocalDeclAddress(&d)) + return existingGV; + + QualType ty = d.getType(); + assert(ty->isConstantSizeType() && "VLAs can't be static"); + + // Use the label if the variable is renamed with the asm-label extension. 
+ if (d.hasAttr()) + errorNYI(d.getSourceRange(), "getOrCreateStaticVarDecl: asm label"); + + std::string name = getStaticDeclName(*this, d); + + mlir::Type lty = getTypes().convertTypeForMem(ty); + assert(!cir::MissingFeatures::addressSpace()); + + if (d.hasAttr() || d.hasAttr()) + errorNYI(d.getSourceRange(), + "getOrCreateStaticVarDecl: LoaderUninitializedAttr"); + assert(!cir::MissingFeatures::addressSpace()); + + mlir::Attribute init = builder.getZeroInitAttr(convertType(ty)); + + cir::GlobalOp gv = builder.createVersionedGlobal( + getModule(), getLoc(d.getLocation()), name, lty, linkage); + // TODO(cir): infer visibility from linkage in global op builder. + gv.setVisibility(getMLIRVisibilityFromCIRLinkage(linkage)); + gv.setInitialValueAttr(init); + gv.setAlignment(getASTContext().getDeclAlign(&d).getAsAlign().value()); + + if (supportsCOMDAT() && gv.isWeakForLinker()) + gv.setComdat(true); + + assert(!cir::MissingFeatures::opGlobalThreadLocal()); + + setGVProperties(gv, &d); + + // OG checks if the expected address space, denoted by the type, is the + // same as the actual address space indicated by attributes. If they aren't + // the same, an addrspacecast is emitted when this variable is accessed. + // In CIR however, cir.get_global already carries that information in + // !cir.ptr type - if this global is in OpenCL local address space, then its + // type would be !cir.ptr<..., addrspace(offload_local)>. Therefore we don't + // need an explicit address space cast in CIR: they will get emitted when + // lowering to LLVM IR. + + // Ensure that the static local gets initialized by making sure the parent + // function gets emitted eventually. + const Decl *dc = cast(d.getDeclContext()); + + // We can't name blocks or captured statements directly, so try to emit their + // parents. + if (isa(dc) || isa(dc)) { + dc = dc->getNonClosureContext(); + // FIXME: Ensure that global blocks get emitted. 
+ if (!dc) + errorNYI(d.getSourceRange(), "non-closure context"); + } + + GlobalDecl gd; + if (isa(dc)) + errorNYI(d.getSourceRange(), "C++ constructors static var context"); + else if (isa(dc)) + errorNYI(d.getSourceRange(), "C++ destructors static var context"); + else if (const auto *fd = dyn_cast(dc)) + gd = GlobalDecl(fd); + else { + // Don't do anything for Obj-C method decls or global closures. We should + // never defer them. + assert(isa(dc) && "unexpected parent code decl"); + } + if (gd.getDecl() && cir::MissingFeatures::openMP()) { + // Disable emission of the parent function for the OpenMP device codegen. + errorNYI(d.getSourceRange(), "OpenMP"); + } + + return gv; +} + +/// Add the initializer for 'd' to the global variable that has already been +/// created for it. If the initializer has a different type than gv does, this +/// may free gv and return a different one. Otherwise it just returns gv. +cir::GlobalOp CIRGenFunction::addInitializerToStaticVarDecl( + const VarDecl &d, cir::GlobalOp gv, cir::GetGlobalOp gvAddr) { + ConstantEmitter emitter(*this); + mlir::TypedAttr init = + mlir::cast(emitter.tryEmitForInitializer(d)); + + // If constant emission failed, then this should be a C++ static + // initializer. + if (!init) { + cgm.errorNYI(d.getSourceRange(), "static var without initializer"); + return gv; + } + + // TODO(cir): There should be debug code here to assert that the decl size + // matches the CIR data layout type alloc size, but the code for calculating + // the type alloc size is not implemented yet. + assert(!cir::MissingFeatures::dataLayoutTypeAllocSize()); + + // The initializer may differ in type from the global. Rewrite + // the global to match the initializer. (We have to do this + // because some types, like unions, can't be completely represented + // in the LLVM type system.) 
+ if (gv.getSymType() != init.getType()) { + gv.setSymType(init.getType()); + + // Normally this should be done with a call to cgm.replaceGlobal(oldGV, gv), + // but since at this point the current block hasn't been really attached, + // there's no visibility into the GetGlobalOp corresponding to this Global. + // Given those constraints, thread in the GetGlobalOp and update it + // directly. + assert(!cir::MissingFeatures::addressSpace()); + gvAddr.getAddr().setType(builder.getPointerTo(init.getType())); + } + + bool needsDtor = + d.needsDestruction(getContext()) == QualType::DK_cxx_destructor; + + assert(!cir::MissingFeatures::opGlobalConstant()); + gv.setInitialValueAttr(init); + + emitter.finalize(gv); + + if (needsDtor) { + // We have a constant initializer, but a nontrivial destructor. We still + // need to perform a guarded "initialization" in order to register the + // destructor. + cgm.errorNYI(d.getSourceRange(), "C++ guarded init"); + } + + return gv; +} + +void CIRGenFunction::emitStaticVarDecl(const VarDecl &d, + cir::GlobalLinkageKind linkage) { + // Check to see if we already have a global variable for this + // declaration. This can happen when double-emitting function + // bodies, e.g. with complete and base constructors. + cir::GlobalOp globalOp = cgm.getOrCreateStaticVarDecl(d, linkage); + // TODO(cir): we should have a way to represent global ops as values without + // having to emit a get global op. Sometimes these emissions are not used. + mlir::Value addr = builder.createGetGlobal(globalOp); + auto getAddrOp = mlir::cast(addr.getDefiningOp()); + + CharUnits alignment = getContext().getDeclAlign(&d); + + // Store into LocalDeclMap before generating initializer to handle + // circular references. + mlir::Type elemTy = convertTypeForMem(d.getType()); + setAddrOfLocalVar(&d, Address(addr, elemTy, alignment)); + + // We can't have a VLA here, but we can have a pointer to a VLA, + // even though that doesn't really make any sense. 
+ // Make sure to evaluate VLA bounds now so that we have them for later. + if (d.getType()->isVariablyModifiedType()) { + cgm.errorNYI(d.getSourceRange(), + "emitStaticVarDecl: variably modified type"); + } + + // Save the type in case adding the initializer forces a type change. + mlir::Type expectedType = addr.getType(); + + cir::GlobalOp var = globalOp; + + assert(!cir::MissingFeatures::cudaSupport()); + + // If this value has an initializer, emit it. + if (d.getInit()) + var = addInitializerToStaticVarDecl(d, var, getAddrOp); + + var.setAlignment(alignment.getAsAlign().value()); + + // There are a lot of attributes that need to be handled here. Until + // we start to support them, we just report an error if there are any. + if (d.hasAttrs()) + cgm.errorNYI(d.getSourceRange(), "static var with attrs"); + + if (cgm.getCodeGenOpts().KeepPersistentStorageVariables) + cgm.errorNYI(d.getSourceRange(), "static var keep persistent storage"); + + // From traditional codegen: + // We may have to cast the constant because of the initializer + // mismatch above. + // + // FIXME: It is really dangerous to store this in the map; if anyone + // RAUW's the GV uses of this constant will be invalid. 
+ mlir::Value castedAddr = + builder.createBitcast(getAddrOp.getAddr(), expectedType); + localDeclMap.find(&d)->second = Address(castedAddr, elemTy, alignment); + cgm.setStaticLocalDeclAddress(&d, var); + + assert(!cir::MissingFeatures::sanitizers()); + assert(!cir::MissingFeatures::generateDebugInfo()); +} + void CIRGenFunction::emitScalarInit(const Expr *init, mlir::Location loc, LValue lvalue, bool capturedByInit) { assert(!cir::MissingFeatures::objCLifetime()); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index 9421ea26a429f..318d3fbf3f9e1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -469,6 +469,10 @@ class CIRGenFunction : public CIRGenTypeCache { /// compare the result against zero, returning an Int1Ty value. mlir::Value evaluateExprAsBool(const clang::Expr *e); + cir::GlobalOp addInitializerToStaticVarDecl(const VarDecl &d, + cir::GlobalOp gv, + cir::GetGlobalOp gvAddr); + /// Set the address of a local variable. 
void setAddrOfLocalVar(const clang::VarDecl *vd, Address addr) { assert(!localDeclMap.count(vd) && "Decl already exists in LocalDeclMap!"); @@ -955,6 +959,8 @@ class CIRGenFunction : public CIRGenTypeCache { void emitScalarInit(const clang::Expr *init, mlir::Location loc, LValue lvalue, bool capturedByInit = false); + void emitStaticVarDecl(const VarDecl &d, cir::GlobalLinkageKind linkage); + void emitStoreOfScalar(mlir::Value value, Address addr, bool isVolatile, clang::QualType ty, bool isInit = false, bool isNontemporal = false); diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index f76fd8e733642..03606dba200fd 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -113,8 +113,21 @@ class CIRGenModule : public CIRGenTypeCache { mlir::Operation *lastGlobalOp = nullptr; + llvm::DenseMap staticLocalDeclMap; + mlir::Operation *getGlobalValue(llvm::StringRef ref); + cir::GlobalOp getStaticLocalDeclAddress(const VarDecl *d) { + return staticLocalDeclMap[d]; + } + + void setStaticLocalDeclAddress(const VarDecl *d, cir::GlobalOp c) { + staticLocalDeclMap[d] = c; + } + + cir::GlobalOp getOrCreateStaticVarDecl(const VarDecl &d, + cir::GlobalLinkageKind linkage); + /// If the specified mangled name is not in the module, create and return an /// mlir::GlobalOp value cir::GlobalOp getOrCreateCIRGlobal(llvm::StringRef mangledName, mlir::Type ty, diff --git a/clang/test/CIR/CodeGen/static-vars.c b/clang/test/CIR/CodeGen/static-vars.c new file mode 100644 index 0000000000000..f45a41d9a00fc --- /dev/null +++ b/clang/test/CIR/CodeGen/static-vars.c @@ -0,0 +1,37 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s + +void func1(void) { + // Should lower default-initialized static vars. 
+ static int i; + // CHECK-DAG: cir.global "private" internal dsolocal @func1.i = #cir.int<0> : !s32i + + // Should lower constant-initialized static vars. + static int j = 1; + // CHECK-DAG: cir.global "private" internal dsolocal @func1.j = #cir.int<1> : !s32i + + // Should properly shadow static vars in nested scopes. + { + static int j = 2; + // CHECK-DAG: cir.global "private" internal dsolocal @func1.j.1 = #cir.int<2> : !s32i + } + { + static int j = 3; + // CHECK-DAG: cir.global "private" internal dsolocal @func1.j.2 = #cir.int<3> : !s32i + } + + // Should lower basic static vars arithmetics. + j++; + // CHECK-DAG: %[[#V2:]] = cir.get_global @func1.j : !cir.ptr + // CHECK-DAG: %[[#V3:]] = cir.load{{.*}} %[[#V2]] : !cir.ptr, !s32i + // CHECK-DAG: %[[#V4:]] = cir.unary(inc, %[[#V3]]) nsw : !s32i, !s32i + // CHECK-DAG: cir.store{{.*}} %[[#V4]], %[[#V2]] : !s32i, !cir.ptr +} + +// Should shadow static vars on different functions. +void func2(void) { + static char i; + // CHECK-DAG: cir.global "private" internal dsolocal @func2.i = #cir.int<0> : !s8i + static float j; + // CHECK-DAG: cir.global "private" internal dsolocal @func2.j = #cir.fp<0.000000e+00> : !cir.float +} diff --git a/clang/test/CIR/CodeGen/static-vars.cpp b/clang/test/CIR/CodeGen/static-vars.cpp new file mode 100644 index 0000000000000..9b892c69a6fed --- /dev/null +++ b/clang/test/CIR/CodeGen/static-vars.cpp @@ -0,0 +1,49 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t1.ll +// RUN: FileCheck --check-prefix=LLVM --input-file=%t1.ll %s + +void func1(void) { + // Should lower default-initialized static vars. + static int i; + // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1i = #cir.int<0> : !s32i + + // Should lower constant-initialized static vars. 
+ static int j = 1; + // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1j = #cir.int<1> : !s32i + + // Should properly shadow static vars in nested scopes. + { + static int j = 2; + // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1j_0 = #cir.int<2> : !s32i + } + { + static int j = 3; + // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func1vE1j_1 = #cir.int<3> : !s32i + } + + // Should lower basic static vars arithmetics. + j++; + // CHECK-DAG: %[[#V2:]] = cir.get_global @_ZZ5func1vE1j : !cir.ptr + // CHECK-DAG: %[[#V3:]] = cir.load{{.*}} %[[#V2]] : !cir.ptr, !s32i + // CHECK-DAG: %[[#V4:]] = cir.unary(inc, %[[#V3]]) nsw : !s32i, !s32i + // CHECK-DAG: cir.store{{.*}} %[[#V4]], %[[#V2]] : !s32i, !cir.ptr +} + +// Should shadow static vars on different functions. +void func2(void) { + static char i; + // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func2vE1i = #cir.int<0> : !s8i + static float j; + // CHECK-DAG: cir.global "private" internal dsolocal @_ZZ5func2vE1j = #cir.fp<0.000000e+00> : !cir.float +} + +// CHECK-DAG: cir.global linkonce_odr comdat @_ZZ4testvE1c = #cir.int<0> : !s32i + +// LLVM-DAG: $_ZZ4testvE1c = comdat any +// LLVM-DAG: @_ZZ4testvE1c = linkonce_odr global i32 0, comdat, align 4 + +inline void test() { static int c; } +// CHECK-LABEL: @_Z4testv +// CHECK: {{%.*}} = cir.get_global @_ZZ4testvE1c : !cir.ptr +void foo() { test(); } From 79e06bf1ae9961c5045134288fd8acc9173f6be2 Mon Sep 17 00:00:00 2001 From: zGoldthorpe Date: Fri, 13 Jun 2025 12:22:21 -0600 Subject: [PATCH 425/851] [AMDGPU] Extended vector promotion to aggregate types. (#143784) Extends the `amdgpu-promote-alloca-to-vector` pass to also promote aggregate types whose elements are all the same type to vector registers. The motivation for this extension was to account for IR generated by the frontend containing several singleton struct types containing vectors or vector-like elements, though the implementation is strictly more general. 
--- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 106 ++++--- .../CodeGen/AMDGPU/promote-alloca-structs.ll | 286 ++++++++++++++++++ 2 files changed, 351 insertions(+), 41 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 700dc87d2f821..e90a3a275f67c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -818,6 +818,39 @@ static BasicBlock::iterator skipToNonAllocaInsertPt(BasicBlock &BB, return I; } +/// Get the underlying type of a homogeneous aggregate type, or nullptr if the +/// type is non-homogeneous. +static Type *getHomogeneousType(Type *Ty) { + Type *ElemTy = nullptr; + SmallVector WorkList; + WorkList.push_back(Ty); + while (!WorkList.empty()) { + Type *CurTy = WorkList.pop_back_val(); + + // Check if the current type is an aggregate type. + if (auto *VectorTy = dyn_cast(CurTy)) { + WorkList.push_back(VectorTy->getElementType()); + continue; + } + if (auto *ArrayTy = dyn_cast(CurTy)) { + WorkList.push_back(ArrayTy->getElementType()); + continue; + } + if (auto *StructTy = dyn_cast(CurTy)) { + WorkList.append(StructTy->element_begin(), StructTy->element_end()); + continue; + } + + // If not, it must be the same as all other non-aggregate types. + if (!ElemTy) + ElemTy = CurTy; + else if (ElemTy != CurTy) + return nullptr; + } + + return ElemTy; +} + // FIXME: Should try to pick the most likely to be profitable allocas first. 
bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { LLVM_DEBUG(dbgs() << "Trying to promote to vector: " << Alloca << '\n'); @@ -828,42 +861,42 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { } Type *AllocaTy = Alloca.getAllocatedType(); - auto *VectorTy = dyn_cast(AllocaTy); - if (auto *ArrayTy = dyn_cast(AllocaTy)) { - uint64_t NumElems = 1; - Type *ElemTy; - do { - NumElems *= ArrayTy->getNumElements(); - ElemTy = ArrayTy->getElementType(); - } while ((ArrayTy = dyn_cast(ElemTy))); - - // Check for array of vectors - auto *InnerVectorTy = dyn_cast(ElemTy); - if (InnerVectorTy) { - NumElems *= InnerVectorTy->getNumElements(); - ElemTy = InnerVectorTy->getElementType(); - } + Type *ElemTy = getHomogeneousType(AllocaTy); - if (VectorType::isValidElementType(ElemTy) && NumElems > 0) { - unsigned ElementSize = DL->getTypeSizeInBits(ElemTy) / 8; - if (ElementSize > 0) { - unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy); - // Expand vector if required to match padding of inner type, - // i.e. odd size subvectors. - // Storage size of new vector must match that of alloca for correct - // behaviour of byte offsets and GEP computation. 
- if (NumElems * ElementSize != AllocaSize) - NumElems = AllocaSize / ElementSize; - if (NumElems > 0 && (AllocaSize % ElementSize) == 0) - VectorTy = FixedVectorType::get(ElemTy, NumElems); - } - } + if (!ElemTy || !VectorType::isValidElementType(ElemTy)) { + LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); + return false; } - if (!VectorTy) { - LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n"); + unsigned ElementSizeInBits = DL->getTypeSizeInBits(ElemTy); + if (ElementSizeInBits != DL->getTypeAllocSizeInBits(ElemTy)) { + LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " + "does not match the type's size\n"); + return false; + } + unsigned ElementSize = ElementSizeInBits / 8; + if (ElementSize == 0) { + LLVM_DEBUG(dbgs() << " Cannot create vector of zero-sized elements\n"); + return false; + } + + // Calculate the size of the corresponding vector, accounting for padding of + // inner types, e.g., odd-sized subvectors. Storage size of new vector must + // match that of alloca for correct behaviour of byte offsets and GEP + // computation. 
+ unsigned AllocaSize = DL->getTypeStoreSize(AllocaTy); + unsigned NumElems = AllocaSize / ElementSize; + if (NumElems == 0) { + LLVM_DEBUG(dbgs() << " Cannot vectorize an empty aggregate type\n"); return false; } + if (NumElems * ElementSize != AllocaSize) { + LLVM_DEBUG( + dbgs() << " Cannot convert type into vector of the same size\n"); + return false; + } + auto *VectorTy = FixedVectorType::get(ElemTy, NumElems); + assert(VectorTy && "Failed to create vector type."); const unsigned MaxElements = (MaxVectorRegs * 32) / DL->getTypeSizeInBits(VectorTy->getElementType()); @@ -895,15 +928,6 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { LLVM_DEBUG(dbgs() << " Attempting promotion to: " << *VectorTy << "\n"); - Type *VecEltTy = VectorTy->getElementType(); - unsigned ElementSizeInBits = DL->getTypeSizeInBits(VecEltTy); - if (ElementSizeInBits != DL->getTypeAllocSizeInBits(VecEltTy)) { - LLVM_DEBUG(dbgs() << " Cannot convert to vector if the allocation size " - "does not match the type's size\n"); - return false; - } - unsigned ElementSize = ElementSizeInBits / 8; - assert(ElementSize > 0); for (auto *U : Uses) { Instruction *Inst = cast(U->getUser()); @@ -943,7 +967,7 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToVector(AllocaInst &Alloca) { if (auto *GEP = dyn_cast(Inst)) { // If we can't compute a vector index from this GEP, then we can't // promote this alloca to vector. 
- Value *Index = GEPToVectorIndex(GEP, &Alloca, VecEltTy, *DL, NewGEPInsts); + Value *Index = GEPToVectorIndex(GEP, &Alloca, ElemTy, *DL, NewGEPInsts); if (!Index) return RejectUser(Inst, "cannot compute vector index for GEP"); diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll new file mode 100644 index 0000000000000..1cdd027fef89d --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-structs.ll @@ -0,0 +1,286 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-promote-alloca-to-vector -amdgpu-promote-alloca-to-vector-limit=512 %s | FileCheck %s + +define i8 @test_v4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_v4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca <4 x i8>, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_a4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_a4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca [4 x i8], align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_a2v4i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 
@test_a2v4i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca [2 x <4 x i8>], align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_a2v3i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_a2v3i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca [2 x <3 x i8>], align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_a2a4i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_a2a4i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca [2 x [4 x i8]], align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_a2a3i8(i48 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_a2a3i8( +; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8> +; 
CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca [2 x [3 x i8]], align 4, addrspace(5) + store i48 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_s1v4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s1v4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {<4 x i8>}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_s1a4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s1a4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {[4 x i8]}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_s4i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s4i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {i8, i8, i8, i8}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) 
%stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_s2v4i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2v4i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {<4 x i8>, <4 x i8>}, align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_s2v2i8v4i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2v2i8v4i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {<2 x i8>, <4 x i8>}, align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_s2v2i8v3i8(i64 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2v2i8v3i8( +; CHECK-SAME: i64 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <8 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i64 [[BITS]] to <8 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <8 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {<2 x i8>, <3 x i8>}, align 4, addrspace(5) + store i64 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 
@test_s2s2i8s4i8(i48 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2s2i8s4i8( +; CHECK-SAME: i48 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <6 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i48 [[BITS]] to <6 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <6 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {{i8, i8}, {i8, i8, i8, i8}}, align 4, addrspace(5) + store i48 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_s2s2i8s3i8(i40 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s2s2i8s3i8( +; CHECK-SAME: i40 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <5 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i40 [[BITS]] to <5 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <5 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {{i8, i8}, {i8, i8, i8}}, align 4, addrspace(5) + store i40 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_s3i8s1i8v2i8(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s3i8s1i8v2i8( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = freeze <4 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i32 [[BITS]] to <4 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <4 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {i8, {i8}, <2 x i8>}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +define i8 @test_s3i8i8s0(i16 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_s3i8i8s0( +; CHECK-SAME: i16 [[BITS:%.*]], i64 [[IDX:%.*]]) { 
+; CHECK-NEXT: [[STACK:%.*]] = freeze <2 x i8> poison +; CHECK-NEXT: [[TMP1:%.*]] = bitcast i16 [[BITS]] to <2 x i8> +; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i8> [[TMP1]], i64 [[IDX]] +; CHECK-NEXT: ret i8 [[TMP2]] +; + %stack = alloca {i8, i8, {}}, align 4, addrspace(5) + store i16 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +; heterogeneous element types are not supported +define i8 @test_heterogeneous(i32 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_heterogeneous( +; CHECK-SAME: i32 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, i8, i16 }, align 4, addrspace(5) +; CHECK-NEXT: store i32 [[BITS]], ptr addrspace(5) [[STACK]], align 4 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 +; CHECK-NEXT: ret i8 [[VAL]] +; + %stack = alloca {i8, i8, i16}, align 4, addrspace(5) + store i32 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} + +; empty types are not supported +define void @test_empty() { +; CHECK-LABEL: define void @test_empty() { +; CHECK-NEXT: [[STACK:%.*]] = alloca {}, align 4, addrspace(5) +; CHECK-NEXT: ret void +; + %stack = alloca {}, align 4, addrspace(5) + ret void +} + +; singleton types are not supported +define i8 @test_singleton(i8 %bits, i64 %idx) { +; CHECK-LABEL: define i8 @test_singleton( +; CHECK-SAME: i8 [[BITS:%.*]], i64 [[IDX:%.*]]) { +; CHECK-NEXT: [[STACK:%.*]] = alloca { i8, {} }, align 4, addrspace(5) +; CHECK-NEXT: store i8 [[BITS]], ptr addrspace(5) [[STACK]], align 1 +; CHECK-NEXT: [[PTR:%.*]] = getelementptr inbounds i8, ptr addrspace(5) [[STACK]], i64 [[IDX]] +; CHECK-NEXT: [[VAL:%.*]] = load i8, ptr addrspace(5) [[PTR]], align 1 +; 
CHECK-NEXT: ret i8 [[VAL]] +; + %stack = alloca {i8, {}}, align 4, addrspace(5) + store i8 %bits, ptr addrspace(5) %stack + %ptr = getelementptr inbounds i8, ptr addrspace(5) %stack, i64 %idx + %val = load i8, ptr addrspace(5) %ptr, align 1 + ret i8 %val +} From a08de429e4ae0baaed23060cbae5c73dc6ffcc5d Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Fri, 13 Jun 2025 14:46:54 -0400 Subject: [PATCH 426/851] [gn] port cc365331af42 --- llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn index fec917c25b190..ca05ac1b24647 100644 --- a/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/include/llvm/Config/BUILD.gn @@ -299,6 +299,7 @@ write_cmake_config("llvm-config") { "LLVM_ENABLE_TELEMETRY=", "LLVM_DEFAULT_TARGET_TRIPLE=$llvm_target_triple", "LLVM_ENABLE_DEBUGLOC_COVERAGE_TRACKING=", + "LLVM_ENABLE_DEBUGLOC_ORIGIN_TRACKING=", "LLVM_ENABLE_DUMP=", "LLVM_ENABLE_HTTPLIB=", "LLVM_FORCE_USE_OLD_TOOLCHAIN=", From 2f1e6eb6c3e731266052536c3f98cce3a71a316e Mon Sep 17 00:00:00 2001 From: yonghong-song Date: Fri, 13 Jun 2025 11:58:48 -0700 Subject: [PATCH 427/851] [BPF] Report an warning if certain insn imm operand cannot fit in 32bit (#142989) Ihor Solodrai reported a case ([1]) where gcc reports an error but clang ignores that error and proceeds to generate incorrect code. More specifically, the problematic code looks like: if r1 == 0xcafefeeddeadbeef goto